Preprocessing and Exploratory Data Analysis

a) Missing values

# Load the UCI Adult census data. The raw files have no header row and mark
# missing values with "?", so map that token to NA on read; strip.white trims
# the leading space that follows each comma in these files (without it,
# na.strings = "?" would not match " ?").
train <- read.table("../data/rawdata/adult.data.txt", sep = ",", na.strings = "?",
                    strip.white = TRUE)
test <- read.table("../data/rawdata/adult.test.txt", sep = ",", na.strings = "?",
                   strip.white = TRUE)

dim(train)
## [1] 32561    15
dim(test)
## [1] 16281    15

# Assign the same descriptive column names to both sets from a single shared
# vector, so the two data frames cannot drift out of sync.
adult_cols <- c("age", "workclass", "fnlwgt", "education", "education-num",
                "marital-status", "occupation", "relationship", "race", "sex",
                "capital-gain", "capital-loss", "hours-per-week",
                "native-country", "income")
colnames(train) <- adult_cols
colnames(test) <- adult_cols



# Find missing values and NAs for the training set.
# For every column, print three diagnostics: row names holding NA, row names
# holding an empty string, and row names holding a literal " ?" token.
# NOTE(review): because read.table() was called with na.strings = "?" and
# strip.white = TRUE, every "?" entry is already NA at this point, so the ""
# and " ?" scans should always report 0 — they are retained purely as a
# sanity check that the import handled the sentinel correctly.
for (i in seq_len(ncol(train))) {
  cat("<names of NA rows in", colnames(train)[i], "variable>", "\n")
  # Compute each row-name vector once instead of rebuilding it for the count.
  na_rows <- rownames(train)[is.na(train[, i])]
  cat(na_rows, "\n")
  cat("Number of NA values:  ", length(na_rows), "\n")
  print("======================================")
  print("======================================")
  
  cat("<names of rows contain missing values in", colnames(train)[i], "variable>", "\n")
  blank_rows <- rownames(train)[which(train[, i] == "")]
  cat(blank_rows, "\n")
  cat("Number of Missing values :  ", length(blank_rows), "\n")
  print("======================================")
  print("======================================")
  
  cat("<names of rows contain ? values in", colnames(train)[i], "variable>", "\n")
  qmark_rows <- rownames(train)[which(train[, i] == " ?")]
  cat(qmark_rows, "\n")
  cat("Number of ? values :  ", length(qmark_rows), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable> 
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 6754 
6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 13448 13494 
13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 19789 19813 19815 
19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 25637 25670 25686 25704 25727 
25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 31578 31592 31595 31606 31622 
31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32311 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543 
## Number of NA values:   1836 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable> 
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5362 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 
6754 6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10846 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 
13448 13494 13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14773 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 
19789 19813 19815 19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20338 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23233 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 
25637 25670 25686 25704 25727 25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 
31578 31592 31595 31606 31622 31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32305 32311 32315 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543 
## Number of NA values:   1843 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable> 
## 15 39 52 62 94 246 250 298 394 454 558 713 726 730 778 781 888 956 1027 1037 1116 1153 1159 1200 1225 1253 1327 1349 1392 1555 1558 1582 1594 1677 1712 1739 1819 1901 1991 2016 2100 2105 2182 2372 2513 2514 2519 2550 2573 2588 2592 2640 2718 2736 2776 2795 2910 2927 3024 3108 3132 3165 3167 3188 3201 3233 3248 3257 3462 3485 3496 3533 3580 3637 3835 3857 3859 4007 4157 4173 4198 4245 4302 4327 4397 4406 4463 4511 4579 4600 4640 4657 4659 4672 4773 4787 4828 5082 5181 5186 5202 5235 5310 5348 5375 5402 5451 5541 5648 5664 5684 5710 5824 5842 5855 5964 6006 6060 6130 6177 6187 6243 6320 6361 6365 6377 6396 6534 6677 6738 6845 7046 7073 7081 7097 7154 7167 7177 7254 7285 7328 7346 7399 7476 7616 7635 7689 7851 7862 7863 7903 7965 7991 8146 8161 8208 8226 8283 8357 8366 8478 8872 8904 8916 9016 9041 9238 9367 9419 9504 9538 9560 9581 9617 9625 9740 9786 9800 9850 9867 9986 10012 10063 10183 10185 10219 10289 10344 10354 10404 10409 10575 10635 10648 10675 10763 10778 10783 11148 11188 11222 11285 11301 11424 11447 11478 11596 11615 11653 11660 11984 11989 12005 12083 12115 12173 12261 12281 12316 12330 12363 12471 12561 12644 12656 12691 12696 12717 12749 12831 12900 12960 12974 12997 13089 13199 13202 13282 13306 13500 13604 13692 13748 13769 13818 13821 13827 13828 13898 13914 13919 13972 14044 14086 14103 14196 14235 14247 14341 14369 14411 14460 14563 14578 14583 14585 14593 14858 15024 15037 15137 15153 15162 15198 15220 15445 15476 15529 15595 15610 15614 15673 15679 15693 15735 15793 15864 15932 15933 15954 15989 16037 16080 16109 16142 16143 16232 16261 16267 16329 16382 16418 16440 16489 16501 16636 16648 16839 16863 16976 17022 17108 17194 17202 17275 17379 17453 17482 17483 17624 17648 17895 18066 18234 18278 18366 18413 18439 18460 18556 18586 18616 18673 18678 18907 18910 18983 19038 19047 19056 19170 19246 19257 19300 19317 19327 19347 19352 19415 19491 19533 19627 19677 19710 19728 19769 19785 19788 19947 19998 20204 20285 20334 20359 20465 20481 
20500 20532 20633 20639 20658 20659 20717 20748 20848 21063 21109 21127 21135 21196 21227 21265 21383 21394 21532 21542 21557 21669 21723 21819 22003 22069 22107 22231 22242 22265 22318 22352 22430 22475 22541 22562 22615 22640 22678 22743 22772 22789 22791 22862 22908 22982 23033 23116 23174 23237 23285 23435 23441 23467 23471 23566 23638 23688 23705 23730 23785 23798 23893 23916 24214 24458 24466 24573 24593 24607 24663 24696 24751 24833 24891 24892 24924 24961 24981 25047 25106 25113 25236 25242 25276 25297 25314 25343 25360 25459 25479 25492 25505 25550 25575 25620 25630 25842 25871 26008 26198 26222 26235 26272 26297 26333 26364 26378 26406 26447 26461 26570 26617 26662 26763 26801 26901 26923 26941 26980 27020 27069 27134 27142 27300 27306 27377 27384 27670 28019 28045 28108 28125 28195 28196 28197 28221 28336 28344 28432 28483 28501 28506 28590 28619 28629 28689 28706 28836 28842 28913 28933 28938 29030 29034 29099 29105 29213 29256 29324 29358 29378 29402 29441 29524 29593 29681 29683 29739 29778 29787 29889 29982 30011 30106 30111 30171 30231 30275 30277 30303 30330 30370 30583 30639 30657 30671 30701 30774 30822 30903 30923 31090 31129 31337 31360 31388 31397 31469 31556 31638 31642 31702 31797 31945 32091 32170 32214 32233 32255 32308 32414 32450 32470 32493 32511 32526 
## Number of NA values:   583 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
#Find missing values and NAs for testing set.
# For each column of `test`, print three reports:
#   1. rows that are NA -- read.table(na.strings = "?") already converted
#      "?" entries to NA at load time, so this is where they show up;
#   2. rows equal to the empty string (none expected after strip.white);
#   3. rows equal to " ?" -- dead check in practice, since strip.white = TRUE
#      removes the leading space and na.strings = "?" maps the rest to NA;
#      kept so the printed report mirrors the training-set pass above.
for(i in seq_len(ncol(test))){
  na_rows <- rownames(test)[is.na(test[, i])]
  cat("<names of NA rows in", colnames(test)[i], "variable>", "\n")
  cat(na_rows, "\n")
  cat("Number of NA values:  ", length(na_rows), "\n")
  print("======================================")
  print("======================================")
  
  blank_rows <- rownames(test)[which(test[, i] == "")]
  cat("<names of rows contain missing values in", colnames(test)[i], "variable>", "\n")
  cat(blank_rows, "\n")
  cat("Number of Missing values :  ", length(blank_rows), "\n")
  print("======================================")
  print("======================================")
  
  qmark_rows <- rownames(test)[which(test[, i] == " ?")]
  cat("<names of rows contain ? values in", colnames(test)[i], "variable>", "\n")
  cat(qmark_rows, "\n")
  cat("Number of ? values :  ", length(qmark_rows), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable> 
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 13342 13380 
13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278 
## Number of NA values:   963 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable> 
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8786 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11608 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 
13342 13380 13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13899 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278 
## Number of NA values:   966 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable> 
## 20 66 84 189 254 306 330 404 421 472 516 649 666 688 844 1009 1039 1164 1334 1365 1406 1616 1644 1801 1822 1823 1832 1941 2061 2096 2107 2161 2227 2264 2305 2318 2324 2350 2477 2489 2552 2585 2613 2630 2697 2703 2775 2886 3061 3075 3122 3160 3222 3440 3460 3485 3508 3672 3678 3730 3762 3786 3854 3867 4187 4409 4540 4545 4608 4643 4649 4697 4728 4748 4764 4911 4923 5053 5126 5149 5152 5171 5181 5420 5469 5497 5648 5662 5717 5732 5829 5837 5944 5973 6034 6048 6054 6180 6206 6208 6234 6372 6403 6518 6587 6762 6776 6798 6801 6863 6871 6876 7017 7047 7060 7167 7206 7232 7288 7355 7443 7598 7601 7677 7708 7721 7750 7817 8029 8044 8078 8161 8183 8265 8369 8378 8433 8600 8622 8634 8700 8774 8849 8938 8976 9057 9145 9180 9200 9240 9244 9254 9263 9297 9335 9340 9354 9358 9415 9436 9497 9552 9567 9581 9626 9635 9699 9740 9874 9957 9983 10048 10151 10157 10202 10208 10267 10334 10346 10356 10364 10409 10475 10476 10509 10711 10739 10842 11130 11314 11348 11390 11407 11610 11686 11733 11749 11762 11784 11889 11946 12371 12386 12398 12415 12436 12456 12506 12577 12579 12607 12626 12648 12725 12780 12797 12911 12990 13171 13241 13254 13293 13311 13362 13547 13550 13575 13614 13693 13721 13746 13760 13764 13792 13926 13931 13934 13971 13980 14005 14029 14030 14072 14189 14203 14225 14263 14334 14373 14407 14446 14547 14585 14611 14652 14732 15006 15015 15084 15091 15099 15185 15234 15321 15350 15397 15421 15481 15594 15685 15712 16044 16091 16266 
## Number of NA values:   274 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
#Get percentage of missing values per column (training set).
# colMeans(is.na(.)) is the vectorized equivalent of the per-column
# sum(is.na(x)) / length(x) and avoids apply()'s data.frame -> matrix
# coercion; the result is the same named numeric vector.
colMeans(is.na(train)) * 100
##            age      workclass         fnlwgt      education  education-num 
##       0.000000       5.638647       0.000000       0.000000       0.000000 
## marital-status     occupation   relationship           race            sex 
##       0.000000       5.660146       0.000000       0.000000       0.000000 
##   capital-gain   capital-loss hours-per-week native-country         income 
##       0.000000       0.000000       0.000000       1.790486       0.000000
# Percentage of missing values per column (testing set); see the
# training-set computation above -- colMeans(is.na(.)) replaces the
# equivalent apply() over columns.
colMeans(is.na(test)) * 100
##            age      workclass         fnlwgt      education  education-num 
##       0.000000       5.914870       0.000000       0.000000       0.000000 
## marital-status     occupation   relationship           race            sex 
##       0.000000       5.933296       0.000000       0.000000       0.000000 
##   capital-gain   capital-loss hours-per-week native-country         income 
##       0.000000       0.000000       0.000000       1.682943       0.000000
#MICE package to see the pattern 
# mice::md.pattern() tabulates the joint missingness structure: each row is
# one observed(1)/missing(0) combination of variables, the left margin counts
# observations with that pattern, and the bottom margin counts missing cells
# per variable.
md.pattern(train)
##       age fnlwgt education education-num marital-status relationship race
## 30162   1      1         1             1              1            1    1
##     7   1      1         1             1              1            1    1
##   556   1      1         1             1              1            1    1
##  1809   1      1         1             1              1            1    1
##    27   1      1         1             1              1            1    1
##         0      0         0             0              0            0    0
##       sex capital-gain capital-loss hours-per-week income native-country
## 30162   1            1            1              1      1              1
##     7   1            1            1              1      1              1
##   556   1            1            1              1      1              0
##  1809   1            1            1              1      1              1
##    27   1            1            1              1      1              0
##         0            0            0              0      0            583
##       workclass occupation     
## 30162         1          1    0
##     7         1          0    1
##   556         1          1    1
##  1809         0          0    2
##    27         0          0    3
##            1836       1843 4262
# Visualize the training-set missingness with VIM::aggr (bar chart of
# missing proportions plus the pattern combinations). Stored under a
# descriptive name: the original `plot` binding shadowed base graphics::plot,
# and the object is otherwise only used for its plotting side effect here.
aggr_train_plot <- aggr(train, col = c('blue', 'yellow'),
                        numbers = TRUE, sortVars = TRUE,
                        labels = names(train), cex.axis = .7,
                        gap = 2, ylab = c("Missing data", "Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05660146
##       workclass 0.05638647
##  native-country 0.01790486
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education-num 0.00000000
##  marital-status 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital-gain 0.00000000
##    capital-loss 0.00000000
##  hours-per-week 0.00000000
##          income 0.00000000
# Joint missingness pattern for the testing set, same layout as the
# md.pattern(train) table above: 1 = observed, 0 = missing, with pattern
# counts on the left and per-variable missing counts at the bottom.
md.pattern(test)
##       age fnlwgt education education-num marital-status relationship race
## 15060   1      1         1             1              1            1    1
##     3   1      1         1             1              1            1    1
##   255   1      1         1             1              1            1    1
##   944   1      1         1             1              1            1    1
##    19   1      1         1             1              1            1    1
##         0      0         0             0              0            0    0
##       sex capital-gain capital-loss hours-per-week income native-country
## 15060   1            1            1              1      1              1
##     3   1            1            1              1      1              1
##   255   1            1            1              1      1              0
##   944   1            1            1              1      1              1
##    19   1            1            1              1      1              0
##         0            0            0              0      0            274
##       workclass occupation     
## 15060         1          1    0
##     3         1          0    1
##   255         1          1    1
##   944         0          0    2
##    19         0          0    3
##             963        966 2203
# Visualize the testing-set missingness with VIM::aggr. Renamed from `plot`
# (which shadowed base graphics::plot); within this file the object is only
# assigned, never read back, so the rename is safe.
aggr_test_plot <- aggr(test, col = c('blue', 'yellow'),
                       numbers = TRUE, sortVars = TRUE,
                       labels = names(test), cex.axis = .7,
                       gap = 2, ylab = c("Missing data", "Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05933296
##       workclass 0.05914870
##  native-country 0.01682943
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education-num 0.00000000
##  marital-status 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital-gain 0.00000000
##    capital-loss 0.00000000
##  hours-per-week 0.00000000
##          income 0.00000000
# Hmisc package to impute missing values
# ww <- aregImpute(~ age + workclass + fnlwgt + education + `education-num` + `marital-status` +
#                    occupation + relationship + race + sex + `capital-gain` + `capital-loss` +
#                    `hours-per-week` + income,
#                  data = train, n.impute = 5, group = "income")



#mlr package to impute missing values
# newworkclass <- impute(train[,2], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
# 
# newoccupation <- impute(train[,7], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
# 
# newcountry <- impute(train[,14], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")



#missForest package to impute missing values
# foresting <- missForest(train, maxiter = 5, ntree = 100)
# foresting$OOBerror
# newtrain <- foresting$ximp
# write.csv(newtrain, file = "../data/cleandata/newtrain.csv", col.names = T, row.names = F)
# Reload the missForest-imputed training set written out by the commented
# run above; header = TRUE (never the reassignable alias T).
newtrain <- read.csv("../data/cleandata/newtrain.csv", header = TRUE)
dim(newtrain)
## [1] 32561    15
# foresting2 <- missForest(test, maxiter = 5, ntree = 100)
# foresting2$OOBerror
# newtest <- foresting2$ximp
# write.csv(newtest, file = "../data/cleandata/newtest.csv", col.names = T, row.names = F)
# Reload the missForest-imputed testing set written out by the commented
# run above; header = TRUE (never the reassignable alias T).
newtest <- read.csv("../data/cleandata/newtest.csv", header = TRUE)
dim(newtest)
## [1] 16281    15
#Check whether the data is messed up while imputing missing values.
# Spot-check 20 random rows by comparing imputed vs. raw cell-by-cell:
#   1  -> observed cell unchanged (expected);
#   NA -> cell was NA in the raw data and got imputed (expected, since
#         NA == value yields NA);
#   0  -> an observed value was altered, i.e. imputation corrupted the data
#         -- this should never appear.
# Rewritten without the loop that grew `t` via rbind() from a dummy zero row
# (O(n^2) grow pattern) and shadowed base::t(); sampling with replacement
# matches the original's 20 independent draws.
check_rows <- sample.int(nrow(newtrain), 20, replace = TRUE)
check_matrix <- 1 * do.call(rbind, lapply(check_rows, function(r) {
  newtrain[r, ] == train[r, ]
}))
check_matrix
##       age workclass fnlwgt education education.num marital.status
## 4406    1         1      1         1             1              1
## 18590   1         1      1         1             1              1
## 11515   1         1      1         1             1              1
## 2378    1         1      1         1             1              1
## 6735    1        NA      1         1             1              1
## 919     1         1      1         1             1              1
## 7868    1         1      1         1             1              1
## 16347   1        NA      1         1             1              1
## 8588    1         1      1         1             1              1
## 10709   1         1      1         1             1              1
## 16049   1         1      1         1             1              1
## 6272    1         1      1         1             1              1
## 17338   1         1      1         1             1              1
## 27212   1         1      1         1             1              1
## 17508   1         1      1         1             1              1
## 30001   1         1      1         1             1              1
## 14807   1         1      1         1             1              1
## 16931   1         1      1         1             1              1
## 9813    1         1      1         1             1              1
## 29019   1         1      1         1             1              1
##       occupation relationship race sex capital.gain capital.loss
## 4406           1            1    1   1            1            1
## 18590          1            1    1   1            1            1
## 11515          1            1    1   1            1            1
## 2378           1            1    1   1            1            1
## 6735          NA            1    1   1            1            1
## 919            1            1    1   1            1            1
## 7868           1            1    1   1            1            1
## 16347         NA            1    1   1            1            1
## 8588           1            1    1   1            1            1
## 10709          1            1    1   1            1            1
## 16049          1            1    1   1            1            1
## 6272           1            1    1   1            1            1
## 17338          1            1    1   1            1            1
## 27212          1            1    1   1            1            1
## 17508          1            1    1   1            1            1
## 30001          1            1    1   1            1            1
## 14807          1            1    1   1            1            1
## 16931          1            1    1   1            1            1
## 9813           1            1    1   1            1            1
## 29019          1            1    1   1            1            1
##       hours.per.week native.country income
## 4406               1             NA      1
## 18590              1              1      1
## 11515              1              1      1
## 2378               1              1      1
## 6735               1              1      1
## 919                1              1      1
## 7868               1              1      1
## 16347              1              1      1
## 8588               1              1      1
## 10709              1              1      1
## 16049              1              1      1
## 6272               1              1      1
## 17338              1              1      1
## 27212              1              1      1
## 17508              1              1      1
## 30001              1              1      1
## 14807              1              1      1
## 16931              1              1      1
## 9813               1              1      1
## 29019              1              1      1
#Same sanity check for the test set: originally-NA (imputed) cells compare NA,
#every other cell must compare equal -- a FALSE anywhere would mean the
#imputation corrupted observed values.
check_rows2 <- sample.int(nrow(newtest), 20)
test_check <- newtest[check_rows2, ] == test[check_rows2, ]
test_check
##       age workclass fnlwgt education education.num marital.status
## 6297    1         1      1         1             1              1
## 12202   1         1      1         1             1              1
## 11310   1         1      1         1             1              1
## 14712   1         1      1         1             1              1
## 9277    1         1      1         1             1              1
## 3059    1         1      1         1             1              1
## 6364    1         1      1         1             1              1
## 188     1         1      1         1             1              1
## 13893   1         1      1         1             1              1
## 11202   1        NA      1         1             1              1
## 317     1        NA      1         1             1              1
## 11257   1         1      1         1             1              1
## 16083   1         1      1         1             1              1
## 10765   1         1      1         1             1              1
## 10743   1         1      1         1             1              1
## 6194    1         1      1         1             1              1
## 3742    1         1      1         1             1              1
## 592     1         1      1         1             1              1
## 8118    1         1      1         1             1              1
## 15107   1         1      1         1             1              1
##       occupation relationship race sex capital.gain capital.loss
## 6297           1            1    1   1            1            1
## 12202          1            1    1   1            1            1
## 11310          1            1    1   1            1            1
## 14712          1            1    1   1            1            1
## 9277           1            1    1   1            1            1
## 3059           1            1    1   1            1            1
## 6364           1            1    1   1            1            1
## 188            1            1    1   1            1            1
## 13893          1            1    1   1            1            1
## 11202         NA            1    1   1            1            1
## 317           NA            1    1   1            1            1
## 11257          1            1    1   1            1            1
## 16083          1            1    1   1            1            1
## 10765          1            1    1   1            1            1
## 10743          1            1    1   1            1            1
## 6194           1            1    1   1            1            1
## 3742           1            1    1   1            1            1
## 592            1            1    1   1            1            1
## 8118           1            1    1   1            1            1
## 15107          1            1    1   1            1            1
##       hours.per.week native.country income
## 6297               1              1      1
## 12202              1              1      1
## 11310              1              1      1
## 14712              1              1      1
## 9277               1              1      1
## 3059               1              1      1
## 6364               1              1      1
## 188                1              1      1
## 13893              1              1      1
## 11202              1              1      1
## 317                1              1      1
## 11257              1              1      1
## 16083              1              1      1
## 10765              1              1      1
## 10743              1              1      1
## 6194               1              1      1
## 3742               1              1      1
## 592                1              1      1
## 8118               1              1      1
## 15107              1              1      1

\(\\\)

\(\\\)

b) Exploratory data analyses (2–5 EDAs)

#See structure and summaries before removing outliers
# Imputed test set: 16281 obs. of 15 variables. Note the income factor levels
# here carry a trailing "." (e.g. "<=50K.") unlike the training set -- keep in
# mind when aligning factor levels for modeling.
str(newtest)
## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ income        : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Five-number summaries for numeric columns and level counts for factors of
# the imputed test set (no NAs remain after imputation).
summary(newtest)
##       age                   workclass         fnlwgt       
##  Min.   :17.00   Private         :11963   Min.   :  13492  
##  1st Qu.:28.00   Self-emp-not-inc: 1433   1st Qu.: 116736  
##  Median :37.00   Local-gov       : 1090   Median : 177831  
##  Mean   :38.77   State-gov       :  710   Mean   : 189436  
##  3rd Qu.:48.00   Self-emp-inc    :  594   3rd Qu.: 238384  
##  Max.   :90.00   Federal-gov     :  481   Max.   :1490400  
##                  (Other)         :   10                    
##         education    education.num                 marital.status
##  HS-grad     :5283   Min.   : 1.00   Divorced             :2190  
##  Some-college:3587   1st Qu.: 9.00   Married-AF-spouse    :  14  
##  Bachelors   :2670   Median :10.00   Married-civ-spouse   :7403  
##  Masters     : 934   Mean   :10.07   Married-spouse-absent: 210  
##  Assoc-voc   : 679   3rd Qu.:12.00   Never-married        :5434  
##  11th        : 637   Max.   :16.00   Separated            : 505  
##  (Other)     :2491                   Widowed              : 525  
##            occupation           relationship                  race      
##  Prof-specialty :2111   Husband       :6523   Amer-Indian-Eskimo:  159  
##  Craft-repair   :2040   Not-in-family :4278   Asian-Pac-Islander:  480  
##  Exec-managerial:2035   Other-relative: 525   Black             : 1561  
##  Adm-clerical   :1967   Own-child     :2513   Other             :  135  
##  Sales          :1921   Unmarried     :1679   White             :13946  
##  Other-service  :1825   Wife          : 763                             
##  (Other)        :4382                                                   
##      sex         capital.gain    capital.loss    hours.per.week 
##  Female: 5421   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :10860   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1082   Mean   :  87.9   Mean   :40.39  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :3770.0   Max.   :99.00  
##                                                                 
##        native.country     income     
##  United-States:14892   <=50K.:12435  
##  Mexico       :  311   >50K. : 3846  
##  Philippines  :  111                 
##  Puerto-Rico  :   70                 
##  Germany      :   69                 
##  Canada       :   61                 
##  (Other)      :  767
# Imputed training set: 32561 obs. of 15 variables; income levels have no
# trailing "." (cf. the test set above).
str(newtrain)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Five-number summaries / factor level counts for the imputed training set.
summary(newtrain)
##       age                   workclass         fnlwgt       
##  Min.   :17.00   Private         :24068   Min.   :  12285  
##  1st Qu.:28.00   Self-emp-not-inc: 2776   1st Qu.: 117827  
##  Median :37.00   Local-gov       : 2193   Median : 178356  
##  Mean   :38.58   State-gov       : 1352   Mean   : 189778  
##  3rd Qu.:48.00   Self-emp-inc    : 1164   3rd Qu.: 237051  
##  Max.   :90.00   Federal-gov     :  985   Max.   :1484705  
##                  (Other)         :   23                    
##         education     education.num                 marital.status 
##  HS-grad     :10501   Min.   : 1.00   Divorced             : 4443  
##  Some-college: 7291   1st Qu.: 9.00   Married-AF-spouse    :   23  
##  Bachelors   : 5355   Median :10.00   Married-civ-spouse   :14976  
##  Masters     : 1723   Mean   :10.08   Married-spouse-absent:  418  
##  Assoc-voc   : 1382   3rd Qu.:12.00   Never-married        :10683  
##  11th        : 1175   Max.   :16.00   Separated            : 1025  
##  (Other)     : 5134                   Widowed              :  993  
##            occupation           relationship                   race      
##  Prof-specialty :4295   Husband       :13193   Amer-Indian-Eskimo:  311  
##  Craft-repair   :4162   Not-in-family : 8305   Asian-Pac-Islander: 1039  
##  Exec-managerial:4129   Other-relative:  981   Black             : 3124  
##  Adm-clerical   :3992   Own-child     : 5068   Other             :  271  
##  Sales          :3715   Unmarried     : 3446   White             :27816  
##  Other-service  :3696   Wife          : 1568                             
##  (Other)        :8572                                                    
##      sex         capital.gain    capital.loss    hours.per.week 
##  Female:10771   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :21790   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1078   Mean   :  87.3   Mean   :40.44  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :4356.0   Max.   :99.00  
##                                                                 
##        native.country    income     
##  United-States:29675   <=50K:24720  
##  Mexico       :  657   >50K : 7841  
##  Philippines  :  211                
##  Germany      :  137                
##  Canada       :  121                
##  Puerto-Rico  :  114                
##  (Other)      : 1646
#Deal with outliers for training sets
continuouscol <- c(1, 3, 5, 11, 12, 13) #positions of the continuous variables

# One boxplot per continuous training column, laid out on a 2 x 3 grid.
par(mfrow = c(2, 3))
for (col_idx in continuouscol) {
  col_name <- colnames(newtrain[col_idx])
  boxplot(newtrain[, col_idx],
          main = paste("boxplot for", col_name),
          xlab = col_name)
}

# Kernel density estimate for each continuous training column, drawn with a
# filled polygon for visibility.
for (col_idx in continuouscol) {
  dens <- density(newtrain[, col_idx], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtrain[col_idx])))
  polygon(dens, col = "red", border = "blue")
}

# Collect boxplot-rule outliers (boxplot.stats' points beyond the whiskers)
# for each continuous training column. The list is indexed by column position,
# so non-continuous positions remain NULL; each filled entry holds the outlier
# values followed by their count.
outlierstrain <- list()
for (col_idx in continuouscol) {
  out_vals <- boxplot.stats(newtrain[, col_idx])$out
  outlierstrain[[col_idx]] <- list(out_vals, length(out_vals))
}
head(outlierstrain)
## [[1]]
## [[1]][[1]]
##   [1] 79 90 80 81 90 88 90 90 80 90 81 82 79 81 80 83 90 90 79 81 90 90 80
##  [24] 90 90 79 79 84 90 80 90 81 83 84 81 79 85 82 79 80 90 90 90 84 80 90
##  [47] 90 79 84 90 79 90 90 90 82 81 90 84 79 81 82 81 80 90 80 84 82 79 90
##  [70] 84 90 83 79 81 80 79 80 79 80 90 90 80 90 90 81 83 82 90 90 81 80 80
##  [93] 90 79 80 82 85 80 79 90 81 79 80 79 81 82 88 90 82 88 84 83 79 86 90
## [116] 90 82 83 81 79 90 80 81 79 84 84 79 90 80 81 81 81 90 87 90 80 80 82
## [139] 90 90 85 82 81
## 
## [[1]][[2]]
## [1] 143
## 
## 
## [[2]]
## NULL
## 
## [[3]]
## [[3]][[1]]
##   [1]  544091  507875  446839  432376  494223  428030  483777  633742
##   [9]  523910  635913  538583  477983  425161  860348  423158  481060
##  [17]  416103  445382 1033222  426017  543162  433665  462440  556660
##  [25]  430828  475028  420537  680390  499233  543028  465507  526968
##  [33]  767403  431192  520586  445824  416745  444304  441454  421132
##  [41]  795830  419721  509350  467108  444554  449257  441620  563883
##  [49]  431745  436006  473040  910398  451940  428350  421871  443040
##  [57]  420895  496743  429507  418324  538319  508336  445382  483201
##  [65]  452205  672412  473547  421065  505119  460046  549430  441591
##  [73]  438696  488720  482082  460835  519627  675421  481987  758700
##  [81]  509364  432565  490332  466224  446219  423460  509364  656036
##  [89]  443508  566117  436253  454508  427686  548510  545483  503012
##  [97]  573583  511361  454941  452405  716416  480861  498785  637222
## [105]  430084  423770  417657  446358  457402  664821  462890  598606
## [113]  457237  465326  503923  572751  580248  519006  617021  437994
## [121]  596776  588905  517995  640383  504725  423863  420917  470663
## [129]  611029  437851  495888  549341  421837  746786  550848  510072
## [137]  449432  430471  416129  511331  446559  452640  456399  469705
## [145]  656036  488720  434710  449354  425627  417136  460835  416338
## [153]  424079  423561  688355  587310  628797  421449  424988  443508
## [161]  632613  499249  445758  416164  473133  450580  506329  445168
## [169]  516337  432376  571853 1184622  913447  476573  632593  595000
## [177]  703067  484475  476391  749105  459465  543922  420282  498325
## [185]  447579  420749  482732  437281  427965  505980  549349  496025
## [193]  562558  642830  435022  443546  523095  436770  436493  704108
## [201]  557082  477106  471452  426001  464536  451996  505980  454614
## [209]  473748  506858  434102  454989  537222  595000  454508  577521
## [217]  424012  431426  604506  564135  427781  469907  503675  444089
## [225]  435835  512103  716066  487486  484298  479765  444743  483596
## [233]  525878  423250  538443  493034  434292  496382  432154  528616
## [241]  515025  433491  421223  428350  446358  455995  659273  435604
## [249]  425092  452924  541737  444822  423024  445940  468706  428584
## [257]  972354  459189  498216  608184  444219  433788  586657 1226583
## [265]  664670  447346  504725  427055  561334  499001  791084  917220
## [273]  430084  508548  511289  416577  512992  431745  427862  637080
## [281]  431861  671292  442612  494638  431307  459007  517000  421446
## [289]  548361  648223  522881  433669  461678  416059  473836  745768
## [297]  523067  508891  486332  418176  417419  464945  454508  476653
## [305]  488706  647882  569761  585203  539563 1038553  567788  732569
## [313]  416165  721161  509629  474136  450924  477697  423711  419658
## [321]  553473  496414  421967  453067  466458  421561  483530  560804
## [329]  447079  528616  485496  425528  502316  467799  469921  444134
## [337]  443179  497300  426431  607848  501172  441700  483822  420973
## [345]  514033  470663  472604  487411  558183  416829  430005  426263
## [353]  439608  456236  420779  541282  518030  459248  548580  526528
## [361]  447739  586657  433375  581071  437727  575442  554986  592930
## [369]  632834  423052  504951  484861  449576  496538  459463  505438
## [377]  479482  467108  467108  849857  426562  558944  420054  691903
## [385]  419691  684015  423605  461678  466498  530099  554317  420054
## [393]  450920  427952  695136  698418  464103  526968  450695  548303
## [401]  529216  526164  506436  439919  734193  737315  544686  468713
## [409]  548361  556652  691830  520775  442429  433669  607799  660870
## [417]  440456  471990  483822  423222  500509  487742  498785  423064
## [425]  532379  426895  493862  424855  469602  432555  424468  428271
## [433]  464502  446140  480717  529104  456110  451744  680390  438711
## [441]  483450  419053  857532  454063 1484705  424034  421837  425447
## [449]  456956  434467  755858  523484  436861  654141  469864  424034
## [457]  458549  930948  664366  420629  456236  515629  606111  463667
## [465]  431637  509364  634226  458558  483261  420749  446358  428405
## [473]  451996  423297  568490  447882  450246  456236  448626 1268339
## [481]  467579  455995  698363  617860  615893  427382  565313  591711
## [489]  520231  461337  419554  460408  454915  448337  536725  472070
## [497]  430175  446771  485117  500002  462294  443508  418020  435638
## [505]  420277  511517  438139  462255 1366120  495061  420351  431245
## [513]  434894  441210  419394  593246  449432  473133  440138  462838
## [521]  423222  529223  456618  651396  451951  431861  517036  436361
## [529]  497788  529216  441637  526734  543042  428299  427744  501144
## [537]  417668  631947  489085  436798  443855  438427  437890  540712
## [545]  549174  460437  806552  604537  487085  436341  473748  484024
## [553] 1455435  445382  659504  416745  439263  556688  750972  424884
## [561]  607848  454915  419895  548256  493363  463194  450695  422149
## [569]  552354  469056  435503  561489  455361  578377  509500  889965
## [577]  462180  506329  428499  507086  419732  659558  440129  609935
## [585]  521400  608184  425804  415913  513660  424478  422960  445728
## [593]  467108  615367  557236  562336  427474  493443  443546  430554
## [601]  434097  520078  460408  454934  474617  485117  456618  660461
## [609]  423222  442035  533147  497253  617898  449354  419722  440607
## [617]  442045  450544  953588  425622  609789  598995  421633  609789
## [625]  424719  482732  469697  452283  663394  417668  530454  494784
## [633]  436107  543477  452452  481096  420054  495982  556902  421412
## [641]  432052  418405  732102  548256  476334  709445  463072  469454
## [649]  423616  456604  609789  570821  438176  416356  421561  636017
## [657]  703107  544792  434463  434114  423222  418961  595088  438996
## [665]  607848  433705  462832  476334  527162  470875  416415  456572
## [673]  422836  566049  602513  509060  448026  491000  488541  520033
## [681]  554206  429346  455379  443742  520759  421837  694812  578701
## [689]  422013  462869  456618  549413  598802  511289  464103  462294
## [697]  427422  440417  439919  424494  806316  459548  541343  438839
## [705]  439592 1033222  424468  599629  571017  416577  425199  738812
## [713]  497280  447066  477209  431513  618191  544268  557853  535978
## [721]  668319  423024  491421  682947  469572  574271  456460  478829
## [729]  816750  597843  442274  595461  553405  506329  704108  481987
## [737]  460408  515712  551962  572751  745817  422933  473171  481175
## [745]  433170  476558  420986  447488  446512  497486  433330  496856
## [753] 1161363  435836  424591  425049  441542  419691  433330  444607
## [761]  459342  452808  427474  447555  422718  673764  424494  418405
## [769]  446654  434467  479621  472789  454843  456062  588484  809585
## [777]  493689  445382  482927  503454  574271  462820  478994  434268
## [785]  501671  594187  439779  509462  435469  548664  422813  498079
## [793]  431515  447488  466502  558490  456661  509048  419146  468713
## [801]  653574  706026  511068  427965  452640  475324  470203  513416
## [809]  421561  417941  535978  422249  442274  721712  615367  472580
## [817]  549174  437825 1097453  423222  461715  471452  426836  442131
## [825]  477867  461929  478380  479611  419146  472807  515797  475322
## [833]  510072  570562  491000  419134  423024  473133 1085515  500720
## [841]  421633  511668  455361  521665  478457  548361  591711  518530
## [849]  594187  417668  452406  499197  434430  509866  504871  695411
## [857]  420986  442359  462966  761006  484669  423616  467611  440647
## [865]  506830  574005  478205  604045  465974  415913  605502  589809
## [873]  426467  487347  588003  509629  431426  429897  709798  561334
## [881]  481987  570002  443546 1125613  454915  440706  532845  498328
## [889]  604380  583755  437909  420691  510072  557349  501172  609789
## [897]  476599  424094  557644  706180  425785  606752  417668  673764
## [905]  460214  475324  547886  554206  430035  456236  419740  462832
## [913]  440129  584790  425804  481987  799281  657397  496526  426431
## [921]  440969  487330  444554  512771  466325  440969  512828  422275
## [929]  531055  437666  472166  653574  417605  502837  444304  436798
## [937]  745768  478346  857532  715938  747719  569930  423217  433989
## [945]  475322  585361  452402  425497  502752  492263  543922  766115
## [953]  461337  421561  456922  584259  493034  538822  542265  430283
## [961]  498349  431245  491862  420895  448337  418702  477505  421467
## [969]  469454  749636  433906  437727  668362  449101  981628  470368
## [977]  746432  451059  499935  473625  566537  456367  455553  693066
## [985]  539864  447346  478315  427686  435842  485710  436163  514716
## 
## [[3]][[2]]
## [1] 992
## 
## 
## [[4]]
## NULL
## 
## [[5]]
## [[5]][[1]]
##    [1] 4 3 4 4 2 4 3 4 2 1 4 4 3 3 3 4 2 2 2 3 3 2 4 4 4 3 4 4 3 3 4 3 2 1
##   [35] 4 4 4 4 2 2 3 3 4 3 4 3 4 4 3 2 4 4 4 4 3 4 4 4 4 4 4 2 4 4 4 4 3 3
##   [69] 4 3 4 4 4 4 4 4 4 4 3 4 3 4 4 2 2 3 3 4 3 2 4 4 4 3 3 2 2 4 3 4 1 4
##  [103] 1 4 4 4 3 3 4 3 4 4 4 2 4 3 4 3 3 3 1 4 4 4 4 4 1 4 4 4 3 3 4 4 4 4
##  [137] 4 3 4 4 3 2 4 4 4 1 3 4 4 4 4 2 2 4 4 4 2 4 4 3 4 4 4 4 2 4 4 4 3 4
##  [171] 3 3 3 4 2 4 4 2 4 4 4 3 4 4 4 3 4 3 4 3 4 3 4 2 3 3 4 4 3 3 4 2 4 3
##  [205] 2 2 4 4 2 2 4 4 2 2 3 3 3 4 3 4 4 4 4 4 1 4 3 4 4 4 4 3 4 4 4 1 4 4
##  [239] 4 4 4 4 4 4 1 3 4 1 4 4 2 4 2 4 4 4 3 3 3 4 4 4 4 3 2 2 4 4 3 4 4 2
##  [273] 4 1 4 4 4 4 4 4 4 4 3 1 1 1 4 4 4 2 4 3 3 3 4 2 4 4 4 3 2 4 4 4 2 4
##  [307] 1 4 4 4 4 3 2 2 4 4 4 3 3 3 2 2 4 3 4 3 4 4 4 4 3 4 3 4 4 3 4 4 4 3
##  [341] 4 4 3 3 4 3 4 2 3 2 4 3 2 3 4 4 4 2 4 4 4 4 3 3 4 4 2 4 3 1 3 2 4 3
##  [375] 3 4 3 3 4 4 2 4 3 2 3 4 3 4 4 3 3 2 4 4 4 3 4 3 4 1 4 4 2 2 4 3 1 4
##  [409] 3 3 4 3 4 4 4 3 3 3 4 3 1 4 2 2 4 3 3 3 2 4 4 4 3 4 4 2 3 4 4 3 3 4
##  [443] 3 4 4 4 4 4 4 4 3 2 4 3 4 4 3 2 4 2 4 4 4 3 4 3 4 4 4 2 4 4 3 3 4 3
##  [477] 1 3 2 3 2 4 4 4 3 4 2 2 4 2 2 3 4 2 3 4 3 3 4 4 4 3 2 3 3 3 4 4 4 4
##  [511] 2 3 4 3 2 3 3 3 4 3 4 3 4 4 4 3 4 3 2 4 4 3 3 4 3 4 3 4 3 3 3 2 3 3
##  [545] 4 4 1 4 3 4 3 2 4 2 4 3 3 4 3 3 4 2 4 4 4 2 4 4 4 4 4 4 4 4 4 3 2 4
##  [579] 2 4 4 3 4 4 4 4 4 3 3 4 2 4 4 3 1 3 4 4 1 3 4 4 4 4 3 4 2 4 4 4 4 2
##  [613] 4 3 4 4 4 4 3 4 4 3 2 3 4 2 4 4 4 3 4 3 4 4 4 4 3 4 3 3 4 2 2 3 4 4
##  [647] 3 4 4 3 4 3 3 4 4 4 4 4 4 3 3 4 3 2 1 4 4 3 4 3 4 3 3 4 3 4 2 2 4 4
##  [681] 2 4 3 2 4 3 4 2 4 3 2 4 3 4 2 2 3 2 3 4 4 4 4 4 4 4 4 3 4 4 3 4 2 4
##  [715] 4 4 4 4 4 4 2 4 4 4 4 3 4 3 4 3 1 4 4 3 2 4 3 3 4 4 3 3 4 4 4 3 2 4
##  [749] 4 2 3 4 4 4 4 4 3 4 4 3 4 1 4 1 4 4 4 2 4 3 4 4 2 4 1 3 3 3 4 1 3 4
##  [783] 4 3 2 4 2 4 4 3 4 3 4 4 1 4 2 3 3 3 2 4 3 4 4 4 4 2 1 2 4 3 4 4 4 3
##  [817] 4 3 3 1 4 3 3 2 4 3 3 2 4 3 4 3 4 4 4 4 3 4 4 4 4 4 4 3 2 4 2 3 3 3
##  [851] 4 4 4 4 3 3 4 4 4 3 3 2 4 4 4 4 1 4 2 4 4 4 4 3 4 4 4 2 4 4 4 4 1 4
##  [885] 1 4 4 4 4 4 2 4 1 4 1 4 4 4 4 3 4 1 4 4 4 4 3 4 3 3 3 4 3 3 2 3 4 4
##  [919] 4 1 4 2 4 4 4 4 3 4 3 4 4 3 1 4 4 4 3 4 2 4 4 3 4 3 4 4 3 2 4 4 4 1
##  [953] 4 4 1 4 4 4 4 4 3 2 3 4 3 3 2 3 3 4 4 4 2 4 4 2 4 3 1 4 4 2 4 1 4 4
##  [987] 3 3 3 3 3 4 3 4 3 3 2 4 3 4 4 4 4 4 4 3 4 3 3 4 3 4 3 2 4 4 4 3 4 3
## [1021] 4 3 2 2 4 2 4 4 4 4 2 4 2 3 3 2 3 4 1 4 3 3 3 4 3 4 2 4 4 3 3 4 2 3
## [1055] 3 4 3 4 3 3 4 2 3 4 4 3 4 3 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 3 4 3 3
## [1089] 2 2 2 2 4 4 3 2 4 4 4 3 2 2 3 4 3 2 4 2 4 4 3 4 4 4 3 4 4 4 3 3 4 3
## [1123] 3 3 4 3 3 4 2 3 4 4 2 4 2 2 2 4 3 4 4 3 3 2 2 4 2 4 3 3 2 4 3 2 4 3
## [1157] 3 4 4 4 4 4 4 2 1 4 2 2 4 4 2 4 4 1 2 4 4 4 3 3 3 1 4 2 3 4 1 4 4 2
## [1191] 3 2 4 4 1 4 4 4
## 
## [[5]][[2]]
## [1] 1198
## 
## 
## [[6]]
## NULL
# Row indices of the 15 largest fnlwgt values. order(rank(x)) is equivalent to
# order(x), so the redundant rank() call is dropped.
fnlwgttrainout <- tail(order(newtrain[, 3]), 15)
# Their fnlwgt values: a single vectorized subset instead of growing a vector
# element-by-element inside a 1:length(...) loop.
fnlout <- newtrain[fnlwgttrainout, 3]

#head(order(rank(newtrain[,5])))
# Frequency table of capital-gain; note the 159 rows at the top-coded 99999.
table(newtrain[, 11])
## 
##     0   114   401   594   914   991  1055  1086  1111  1151  1173  1409 
## 29849     6     2    34     8     5    25     4     1     8     3     7 
##  1424  1455  1471  1506  1639  1797  1831  1848  2009  2036  2050  2062 
##     3     1     7    15     1     7     7     6     3     4     5     2 
##  2105  2174  2176  2202  2228  2290  2329  2346  2354  2387  2407  2414 
##     9    48    23    16     5     5     6     6    11     1    19     8 
##  2463  2538  2580  2597  2635  2653  2829  2885  2907  2936  2961  2964 
##    11     1    12    20    11     5    31    24    11     3     3     9 
##  2977  2993  3103  3137  3273  3325  3411  3418  3432  3456  3464  3471 
##     8     2    97    37     6    53    24     5     4     2    23     8 
##  3674  3781  3818  3887  3908  3942  4064  4101  4386  4416  4508  4650 
##    14    12     7     6    32    14    42    20    70    12    12    41 
##  4687  4787  4865  4931  4934  5013  5060  5178  5455  5556  5721  6097 
##     3    23    17     1     7    69     1    97    11     5     3     1 
##  6360  6418  6497  6514  6723  6767  6849  7298  7430  7443  7688  7896 
##     3     9    11     5     2     5    27   246     9     5   284     3 
##  7978  8614  9386  9562 10520 10566 10605 11678 13550 14084 14344 15020 
##     1    55    22     4    43     6    12     2    27    41    26     5 
## 15024 15831 18481 20051 22040 25124 25236 27828 34095 41310 99999 
##   347     6     2    37     1     4    11    34     5     2   159
# Rows holding the maximum capital-gain (the 159 top-coded 99999 values, per
# the table above). Computed from the data instead of the hard-coded
# tail(order(rank(newtrain[, 11])), 159), which silently breaks if the count
# of maximal values ever changes.
gainout <- which(newtrain[, 11] == max(newtrain[, 11]))



#Outliers removing for training sets.
dim(newtrain)
## [1] 32561    15
newtrain <- newtrain[-gainout, ]
dim(newtrain)
## [1] 32402    15
## [1] 32402    15
#Deal with outliers for testing sets
# One boxplot per continuous test-set column.
for (j in continuouscol) {
  boxplot(newtest[, j], main = paste("boxplot for", colnames(newtest[j])),
          xlab = colnames(newtest[j]))
}

# One kernel-density plot per continuous test-set column.
for (j in continuouscol) {
  dens <- density(newtest[, j], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtest[j])))
  polygon(dens, col = "red", border = "blue")
}

# Collect boxplot-rule outliers for each continuous test-set column:
# element j of the list holds list(<outlier values>, <their count>).
# (Non-continuous positions stay NULL, as the printed output shows.)
outlierstest <- list()
for (j in continuouscol) {
  out_vals <- boxplot.stats(newtest[, j])$out
  outlierstest[[j]] <- list(out_vals, length(out_vals))
}
head(outlierstest)
## [[1]]
## [[1]][[1]]
##  [1] 79 80 90 79 80 81 82 83 81 85 80 90 81 84 81 89 81 83 81 82 80 90 81
## [24] 83 80 90 90 84 80 80 80 81 90 85 90 81 81 80 80 79 81 80 88 87 90 79
## [47] 83 79 80 90 79 79 81 81 90 82 90 87 81 88 80 81 80 81 90 88 89 84 80
## [70] 80 83 79 81
## 
## [[1]][[2]]
## [1] 73
## 
## 
## [[2]]
## NULL
## 
## [[3]]
## [[3]][[1]]
##   [1]  444554  432824  465326  445382  479296  428420  456736  537222
##   [9]  513100  447488  512864  500068  446894  599057  479179  471990
##  [17]  457162  455379  542610  479600  448026  437200  652784  573446
##  [25]  453233  662460  426589  629900  499971  450770  481987  478373
##  [33]  486194  509364  632733  504725  560313  651702  644278  535852
##  [41]  445758  452353  475775  455469  522241  427744  473206  427541
##  [49]  581128  444725  608881  490871  430151  431245  451019  430336
##  [57]  433602  437994  436431  914061  624006  510072  484475  505365
##  [65]  593246  714597  816750  491214  446724  552529  454717  425622
##  [73]  575172  475322  622192  566066  493732  427437  427320  614113
##  [81]  445365  472517  459556  548568  565769  429832  424988  426350
##  [89]  789600  424340  447144  864960  497414  471876  723746  427422
##  [97]  421837  692831  535869  433624  638116  467936  698039  427812
## [105]  472861  449101  677398  464621  547931  497039  451742  460322
## [113]  666014  474568  452640  765214  445480  761800  460356 1047822
## [121]  436651  544319  617917  450695  429696  443377  522881  437161
## [129]  421010  479296  459189  469005  457070  750972  505365  458609
## [137]  520231  589155  538193  428251  454321  455399  477345  470486
## [145]  437318  588739  449578  486436  588484  449101  528618  806552
## [153]  478354  467936  505168  858091  451327  482082  663291  447554
## [161]  451603  455995  460408  581025  453983  656488  421633  478457
## [169]  422836  557349  421350  498267  442478  421228  655066  426431
## [177]  494371  737315  541755  436198  594521  442656  491000  455995
## [185]  430672  496856  589838  479296  605504  490332  423453  445382
## [193]  558752  448862  429281  772919  884434  495288  488720  444554
## [201]  604045  437940  697806  632271  497788  464484  587310  467759
## [209]  472344  438587  427055  538243  441227  459465  454950  439777
## [217] 1490400  768659  764638  437458  517995  718736  433682  477083
## [225]  442478  547108  474229  498833  882849  453663  443508  498411
## [233]  504423  746660  488459  423883  457357  501671  786418  565313
## [241]  483201  466458  424934  450200  465334  482096  451603  465725
## [249]  502633  473133  477867  435356  478457  653215  437825  576645
## [257]  510643  538099  425502  432480  482211  539019  496743  455379
## [265]  421132  452402  531055  454076  434081  452402  434710  446947
## [273]  472411  594187  685955  442116  435835  430278  548361  606111
## [281]  459192  592029  426263  513977  647591  566066  553588  433325
## [289]  491607  624572  488706  535740  607118  482677  420973  426431
## [297]  580591  449172  438427  557853  446390  487751  469263  478972
## [305]  441949  430930  635913  485944  557805  626493  444134  433580
## [313]  493034  914061  456736  557349  443336  953588  473547  457710
## [321]  471768  558344  421871  430710  481258  590204  679853  421474
## [329]  443809  516701  443546  535762  438321  814850  427812  874728
## [337]  497525  434102  450141  441949  438429  506830  478277  594194
## [345]  445480  452963  498267  538583  602513  589809  421474  507492
## [353]  546118  446647  530099  453686  443377 1117718  427248  461725
## [361]  460259  849067  590941  572285  608441  720428  423311  436361
## [369]  463601  557359  454024  431515  590522  443546  433592  479406
## [377]  430195  421633  428299  484911  478836  513440  744929  534775
## [385]  511231  598995  456592  525848  442359  458168  457453  913447
## [393]  584259  694105  441227  448841  606347  437566  495366 1024535
## [401]  427474  811615  431551  461929  533660  445382  427475 1210504
## [409]  426263  425830  421837  427770  447210  455995  435836  425816
## [417]  490645  513977  553405  497414  742903  431745  553405  504941
## [425]  450141  456665  449376  487770  448026  443858  473449  440934
## [433]  456430  421200  426589  484879  438696  435638  535027  464552
## [441]  443701  438427  513719  439263  425444  454585  428251  618130
## [449]  542762  771836  473133  464552  435266  437161  462964  423605
## [457]  618808  573446  432204  461484  455379  504871  532969  455665
## [465]  425127  449925  427515  607658  422933  430340  440129
## 
## [[3]][[2]]
## [1] 471
## 
## 
## [[4]]
## NULL
## 
## [[5]]
## [[5]][[1]]
##   [1] 4 4 3 4 4 4 4 4 4 3 2 3 4 4 2 4 4 3 3 2 4 3 3 4 3 3 4 4 4 1 1 4 3 2 4
##  [36] 4 2 3 4 4 1 4 1 4 4 4 3 4 4 3 4 3 4 2 4 2 4 4 4 3 4 2 4 4 3 3 1 1 4 3
##  [71] 4 2 3 4 3 3 3 4 4 4 4 4 3 3 3 2 2 4 4 4 4 3 3 4 3 3 3 3 1 2 3 3 3 1 4
## [106] 4 4 4 4 4 4 4 2 3 4 4 3 4 4 4 3 3 3 4 4 1 4 4 4 3 4 2 4 2 4 4 4 4 3 3
## [141] 4 4 1 4 3 4 4 4 3 4 4 4 3 3 3 4 2 2 4 2 4 4 4 4 4 4 4 4 4 2 4 4 3 4 1
## [176] 2 3 4 3 2 4 1 4 2 3 3 4 4 4 1 2 2 4 3 4 4 4 4 3 2 4 4 4 4 3 3 3 4 3 4
## [211] 2 4 4 4 3 4 3 2 4 4 3 4 2 2 4 1 2 3 4 2 4 4 4 4 4 2 4 4 4 3 4 3 4 3 4
## [246] 3 4 3 4 3 4 4 4 4 3 3 3 2 3 4 3 4 4 4 3 1 2 2 2 2 3 1 2 3 4 4 4 1 1 2
## [281] 4 4 4 4 2 4 3 4 3 1 3 3 1 3 4 4 4 4 4 4 3 3 3 3 3 3 4 4 4 4 3 4 4 3 2
## [316] 4 4 2 4 4 3 4 3 4 4 4 4 4 2 3 4 4 3 2 4 2 4 4 4 4 2 3 4 4 3 3 4 3 2 3
## [351] 4 2 3 4 4 3 4 4 2 4 4 3 2 4 4 4 2 4 4 4 3 4 3 3 4 2 4 2 3 3 3 4 3 4 3
## [386] 4 1 4 3 4 4 3 4 2 4 2 3 3 4 3 2 1 1 2 3 3 4 3 1 3 3 2 4 3 4 3 3 3 4 3
## [421] 4 4 2 3 3 3 3 1 3 3 2 4 3 4 1 2 3 4 4 4 4 4 4 3 3 2 3 4 4 3 4 2 4 4 4
## [456] 4 4 2 4 2 4 2 4 4 3 4 3 2 4 3 4 4 3 4 4 4 4 4 3 4 4 3 4 3 4 4 3 2 4 2
## [491] 2 4 2 4 3 4 4 3 4 3 4 3 4 1 1 4 3 2 4 4 4 4 3 3 4 4 2 4 4 4 3 4 3 1 4
## [526] 3 3 4 3 4 4 4 4 4 4 4 4 4 3 2 3 4 3 4 4 4 4 4 3 4 4 3 4 3 4 2 2 3 2 3
## [561] 3 3 4 4 4 1 3 3 3 4 4 1 3 4 2 3 3 3 2 3 3 4 4 4 3 4 4 1 4 4 4 4 4 4 4
## [596] 4
## 
## [[5]][[2]]
## [1] 596
## 
## 
## [[6]]
## NULL
table(newtest[, 11])
## 
##     0   114   401   594   914   991  1055  1086  1151  1173  1264  1409 
## 14958     2     3    18     2     1    12     4     5     2     2     3 
##  1424  1455  1471  1506  1731  1797  1831  1848  2036  2062  2105  2174 
##     1     3     2     9     1     3     2     3     1     1     6    26 
##  2176  2202  2290  2329  2346  2354  2407  2414  2463  2538  2580  2597 
##     8    12     5     1     2    10     6     2     4     4     8    11 
##  2635  2653  2829  2885  2907  2936  2961  2964  2977  2993  3103  3137 
##     3     6    11     6     7     1     1     5     3     1    55    14 
##  3273  3325  3411  3418  3456  3464  3471  3674  3781  3818  3887  3908 
##     1    28    10     3     4    10     3     8     4     4     2    10 
##  3942  4064  4101  4386  4416  4508  4650  4687  4787  4865  4931  4934 
##     4    12     9    38    12    11    22     1    12     8     3     3 
##  5013  5060  5178  5455  5556  5721  6097  6418  6497  6514  6612  6723 
##    48     1    49     7     1     4     1     7     4     5     1     3 
##  6767  6849  7262  7298  7430  7443  7688  7896  7978  8614  9386  9562 
##     1    15     1   118     6     2   126     1     1    27     9     1 
## 10520 10566 10605 11678 13550 14084 14344 15020 15024 15831 20051 25124 
##    21     2     7     2    15     8     8     5   166     2    12     2 
## 25236 27828 34095 41310 99999 
##     3    24     1     1    85
gainout <- tail(order(rank(newtest[, 11])), 85)



#Outliers removing for testing set (the original comment said "training").
dim(newtest)
## [1] 16281    15
# Guard: `newtest[-integer(0), ]` selects zero rows and would silently
# empty the data frame if gainout were ever empty.
if (length(gainout) > 0) {
  newtest <- newtest[-gainout, ]
}
dim(newtest)
## [1] 16196    15
#Plots after removing outliers training
# Re-draw the boxplots on the cleaned training data.
for (j in continuouscol) {
  boxplot(newtrain[, j], main = paste("boxplot for", colnames(newtrain[j]), "-outliers removed"),
          xlab = colnames(newtrain[j]))
}

# Re-draw the density plots on the cleaned training data.
for (j in continuouscol) {
  dens <- density(newtrain[, j], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtrain[j]), "-outliers removed"))
  polygon(dens, col = "red", border = "blue")
}

#Plots after removing outliers testing
# Same pair of plots for the cleaned test data.
for (j in continuouscol) {
  boxplot(newtest[, j], main = paste("boxplot for", colnames(newtest[j]), "-outliers removed"),
          xlab = colnames(newtest[j]))
}

for (j in continuouscol) {
  dens <- density(newtest[, j], adjust = 1)
  plot(dens, main = paste("density plot for", colnames(newtest[j]), "-outliers removed"))
  polygon(dens, col = "red", border = "blue")
}

\(\\\)

\(\\\)

Check-ups before discretizing and dummifying

detach("package:plyr", unload=TRUE) #because plyr and dplyr existed together conflicting...
## Warning: 'plyr' namespace cannot be unloaded:
##   namespace 'plyr' is imported by 'ggplot2', 'scales', 'reshape2', 'caret', 'pROC' so cannot be unloaded
#Check whether categorical variables can be discretized....
# workclass (training set): bar plot and level counts.
plot(newtrain[["workclass"]])

table(newtrain[["workclass"]])
## 
##      Federal-gov        Local-gov     Never-worked          Private 
##              983             2187                9            23984 
##     Self-emp-inc Self-emp-not-inc        State-gov      Without-pay 
##             1127             2747             1351               14
newtrain %>% group_by(workclass) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 8 x 3
##          workclass     n         freq
##             <fctr> <int>        <dbl>
## 1      Federal-gov   983 0.0303376335
## 2        Local-gov  2187 0.0674958336
## 3     Never-worked     9 0.0002777606
## 4          Private 23984 0.7402012221
## 5     Self-emp-inc  1127 0.0347818036
## 6 Self-emp-not-inc  2747 0.0847787174
## 7        State-gov  1351 0.0416949571
## 8      Without-pay    14 0.0004320721
# workclass (test set): bar plot and level counts.
plot(newtest[["workclass"]])

table(newtest[["workclass"]])
## 
##      Federal-gov        Local-gov     Never-worked          Private 
##              480             1089                3            11919 
##     Self-emp-inc Self-emp-not-inc        State-gov      Without-pay 
##              570             1421              707                7
newtest %>% group_by(workclass) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 8 x 3
##          workclass     n         freq
##             <fctr> <int>        <dbl>
## 1      Federal-gov   480 0.0296369474
## 2        Local-gov  1089 0.0672388244
## 3     Never-worked     3 0.0001852309
## 4          Private 11919 0.7359224500
## 5     Self-emp-inc   570 0.0351938750
## 6 Self-emp-not-inc  1421 0.0877377130
## 7        State-gov   707 0.0436527538
## 8      Without-pay     7 0.0004322055
# education (training set): bar plot and level counts.
plot(newtrain[["education"]])

table(newtrain[["education"]])
## 
##         10th         11th         12th      1st-4th      5th-6th 
##          931         1175          433          168          333 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          646          513         1066         1381         5314 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          401        10478         1705           51          530 
## Some-college 
##         7277
newtrain %>% group_by(education) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   931 0.028732794
##  2         11th  1175 0.036263194
##  3         12th   433 0.013363373
##  4      1st-4th   168 0.005184865
##  5      5th-6th   333 0.010277143
##  6      7th-8th   646 0.019937041
##  7          9th   513 0.015832356
##  8   Assoc-acdm  1066 0.032899204
##  9    Assoc-voc  1381 0.042620826
## 10    Bachelors  5314 0.164002222
## 11    Doctorate   401 0.012375779
## 12      HS-grad 10478 0.323375100
## 13      Masters  1705 0.052620209
## 14    Preschool    51 0.001573977
## 15  Prof-school   530 0.016357015
## 16 Some-college  7277 0.224584902
# education (test set): bar plot and level counts.
plot(newtest[["education"]])

table(newtest[["education"]])
## 
##         10th         11th         12th      1st-4th      5th-6th 
##          456          637          224           79          175 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          309          242          534          677         2648 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          170         5272          922           32          236 
## Some-college 
##         3583
newtest %>% group_by(education) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   456 0.028155100
##  2         11th   637 0.039330699
##  3         12th   224 0.013830575
##  4      1st-4th    79 0.004877748
##  5      5th-6th   175 0.010805137
##  6      7th-8th   309 0.019078785
##  7          9th   242 0.014941961
##  8   Assoc-acdm   534 0.032971104
##  9    Assoc-voc   677 0.041800445
## 10    Bachelors  2648 0.163497160
## 11    Doctorate   170 0.010496419
## 12      HS-grad  5272 0.325512472
## 13      Masters   922 0.056927636
## 14    Preschool    32 0.001975796
## 15  Prof-school   236 0.014571499
## 16 Some-college  3583 0.221227464
# marital.status (training set): bar plot and level counts.
plot(newtrain[["marital.status"]])

table(newtrain[["marital.status"]])
## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  4432                    23                 14844 
## Married-spouse-absent         Never-married             Separated 
##                   417                 10671                  1023 
##               Widowed 
##                   992
newtrain %>% group_by(marital.status) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 7 x 3
##          marital.status     n         freq
##                  <fctr> <int>        <dbl>
## 1              Divorced  4432 0.1367816801
## 2     Married-AF-spouse    23 0.0007098327
## 3    Married-civ-spouse 14844 0.4581198691
## 4 Married-spouse-absent   417 0.0128695760
## 5         Never-married 10671 0.3293315227
## 6             Separated  1023 0.0315721252
## 7               Widowed   992 0.0306153941
# marital.status (test set): bar plot and level counts.
plot(newtest[["marital.status"]])

table(newtest[["marital.status"]])
## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  2181                    13                  7340 
## Married-spouse-absent         Never-married             Separated 
##                   210                  5425                   503 
##               Widowed 
##                   524
newtest %>% group_by(marital.status) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 7 x 3
##          marital.status     n         freq
##                  <fctr> <int>        <dbl>
## 1              Divorced  2181 0.1346628797
## 2     Married-AF-spouse    13 0.0008026673
## 3    Married-civ-spouse  7340 0.4531983206
## 4 Married-spouse-absent   210 0.0129661645
## 5         Never-married  5425 0.3349592492
## 6             Separated   503 0.0310570511
## 7               Widowed   524 0.0323536676
# occupation (training set): bar plot and level counts.
plot(newtrain[["occupation"]])

table(newtrain[["occupation"]])
## 
##      Adm-clerical      Armed-Forces      Craft-repair   Exec-managerial 
##              3986                 9              4154              4085 
##   Farming-fishing Handlers-cleaners Machine-op-inspct     Other-service 
##              1185              1617              2184              3694 
##   Priv-house-serv    Prof-specialty   Protective-serv             Sales 
##               206              4228               734              3690 
##      Tech-support  Transport-moving 
##               992              1638
newtrain %>% group_by(occupation) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 14 x 3
##           occupation     n         freq
##               <fctr> <int>        <dbl>
##  1      Adm-clerical  3986 0.1230170977
##  2      Armed-Forces     9 0.0002777606
##  3      Craft-repair  4154 0.1282019628
##  4   Exec-managerial  4085 0.1260724647
##  5   Farming-fishing  1185 0.0365718166
##  6 Handlers-cleaners  1617 0.0499043269
##  7 Machine-op-inspct  2184 0.0674032467
##  8     Other-service  3694 0.1140053083
##  9   Priv-house-serv   206 0.0063576322
## 10    Prof-specialty  4228 0.1304857725
## 11   Protective-serv   734 0.0226529227
## 12             Sales  3690 0.1138818591
## 13      Tech-support   992 0.0306153941
## 14  Transport-moving  1638 0.0505524350
# occupation (test set): bar plot and level counts.
plot(newtest[["occupation"]])

table(newtest[["occupation"]])
## 
##      Adm-clerical      Armed-Forces      Craft-repair   Exec-managerial 
##              1965                 6              2032              2009 
##   Farming-fishing Handlers-cleaners Machine-op-inspct     Other-service 
##               576               864              1085              1824 
##   Priv-house-serv    Prof-specialty   Protective-serv             Sales 
##               133              2077               367              1912 
##      Tech-support  Transport-moving 
##               548               798
newtest %>% group_by(occupation) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 14 x 3
##           occupation     n         freq
##               <fctr> <int>        <dbl>
##  1      Adm-clerical  1965 0.1213262534
##  2      Armed-Forces     6 0.0003704618
##  3      Craft-repair  2032 0.1254630773
##  4   Exec-managerial  2009 0.1240429736
##  5   Farming-fishing   576 0.0355643369
##  6 Handlers-cleaners   864 0.0533465053
##  7 Machine-op-inspct  1085 0.0669918498
##  8     Other-service  1824 0.1126204001
##  9   Priv-house-serv   133 0.0082119042
## 10    Prof-specialty  2077 0.1282415411
## 11   Protective-serv   367 0.0226599160
## 12             Sales  1912 0.1180538405
## 13      Tech-support   548 0.0338355149
## 14  Transport-moving   798 0.0492714250
# relationship (training set): bar plot and level counts.
plot(newtrain[["relationship"]])

table(newtrain[["relationship"]])
## 
##        Husband  Not-in-family Other-relative      Own-child      Unmarried 
##          13072           8284            981           5066           3442 
##           Wife 
##           1557
newtrain %>% group_by(relationship) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 6 x 3
##     relationship     n       freq
##           <fctr> <int>      <dbl>
## 1        Husband 13072 0.40343189
## 2  Not-in-family  8284 0.25566323
## 3 Other-relative   981 0.03027591
## 4      Own-child  5066 0.15634837
## 5      Unmarried  3442 0.10622801
## 6           Wife  1557 0.04805259
# relationship (test set): bar plot and level counts.
plot(newtest[["relationship"]])

table(newtest[["relationship"]])
## 
##        Husband  Not-in-family Other-relative      Own-child      Unmarried 
##           6465           4262            525           2511           1676 
##           Wife 
##            757
newtest %>% group_by(relationship) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 6 x 3
##     relationship     n       freq
##           <fctr> <int>      <dbl>
## 1        Husband  6465 0.39917264
## 2  Not-in-family  4262 0.26315140
## 3 Other-relative   525 0.03241541
## 4      Own-child  2511 0.15503828
## 5      Unmarried  1676 0.10348234
## 6           Wife   757 0.04673994
# race (training set): bar plot and level counts.
plot(newtrain[["race"]])

table(newtrain[["race"]])
## 
## Amer-Indian-Eskimo Asian-Pac-Islander              Black 
##                311               1029               3117 
##              Other              White 
##                269              27676
newtrain %>% group_by(race) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 5 x 3
##                 race     n        freq
##               <fctr> <int>       <dbl>
## 1 Amer-Indian-Eskimo   311 0.009598173
## 2 Asian-Pac-Islander  1029 0.031757299
## 3              Black  3117 0.096197766
## 4              Other   269 0.008301957
## 5              White 27676 0.854144806
# race (test set): bar plot and level counts.
plot(newtest[["race"]])

table(newtest[["race"]])
## 
## Amer-Indian-Eskimo Asian-Pac-Islander              Black 
##                159                475               1558 
##              Other              White 
##                134              13870
newtest %>% group_by(race) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 5 x 3
##                 race     n        freq
##               <fctr> <int>       <dbl>
## 1 Amer-Indian-Eskimo   159 0.009817239
## 2 Asian-Pac-Islander   475 0.029328229
## 3              Black  1558 0.096196592
## 4              Other   134 0.008273648
## 5              White 13870 0.856384292
# sex (training set): bar plot and level counts.
plot(newtrain[["sex"]])

table(newtrain[["sex"]])
## 
## Female   Male 
##  10749  21653
newtrain %>% group_by(sex) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 2 x 3
##      sex     n      freq
##   <fctr> <int>     <dbl>
## 1 Female 10749 0.3317388
## 2   Male 21653 0.6682612
# sex (test set): bar plot and level counts.
plot(newtest[["sex"]])

table(newtest[["sex"]])
## 
## Female   Male 
##   5407  10789
newtest %>% group_by(sex) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 2 x 3
##      sex     n      freq
##   <fctr> <int>     <dbl>
## 1 Female  5407 0.3338479
## 2   Male 10789 0.6661521
# native.country (training set): bar plot and level counts.
plot(newtrain[["native.country"]])

table(newtrain[["native.country"]])
## 
##                   Cambodia                     Canada 
##                         20                        120 
##                      China                   Columbia 
##                         79                         59 
##                       Cuba         Dominican-Republic 
##                         95                         70 
##                    Ecuador                El-Salvador 
##                         28                        106 
##                    England                     France 
##                         90                         29 
##                    Germany                     Greece 
##                        137                         29 
##                  Guatemala                      Haiti 
##                         64                         44 
##         Holand-Netherlands                   Honduras 
##                          1                         13 
##                       Hong                    Hungary 
##                         23                         13 
##                      India                       Iran 
##                        104                         43 
##                    Ireland                      Italy 
##                         24                         74 
##                    Jamaica                      Japan 
##                         81                         66 
##                       Laos                     Mexico 
##                         22                        656 
##                  Nicaragua Outlying-US(Guam-USVI-etc) 
##                         34                         14 
##                       Peru                Philippines 
##                         31                        210 
##                     Poland                   Portugal 
##                         60                         37 
##                Puerto-Rico                   Scotland 
##                        114                         12 
##                      South                     Taiwan 
##                         89                         56 
##                   Thailand            Trinadad&Tobago 
##                         19                         19 
##              United-States                    Vietnam 
##                      29528                         73 
##                 Yugoslavia 
##                         16
newtrain %>% group_by(native.country) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 41 x 3
##        native.country     n         freq
##                <fctr> <int>        <dbl>
##  1           Cambodia    20 0.0006172458
##  2             Canada   120 0.0037034751
##  3              China    79 0.0024381211
##  4           Columbia    59 0.0018208753
##  5               Cuba    95 0.0029319178
##  6 Dominican-Republic    70 0.0021603605
##  7            Ecuador    28 0.0008641442
##  8        El-Salvador   106 0.0032714030
##  9            England    90 0.0027776063
## 10             France    29 0.0008950065
## # ... with 31 more rows
# native.country (test set): bar plot and level counts.
plot(newtest[["native.country"]])

table(newtest[["native.country"]])
## 
##                   Cambodia                     Canada 
##                         12                         61 
##                      China                   Columbia 
##                         50                         26 
##                       Cuba         Dominican-Republic 
##                         43                         34 
##                    Ecuador                El-Salvador 
##                         17                         49 
##                    England                     France 
##                         38                          9 
##                    Germany                     Greece 
##                         69                         20 
##                  Guatemala                      Haiti 
##                         24                         31 
##                   Honduras                       Hong 
##                          7                         10 
##                    Hungary                      India 
##                          6                         56 
##                       Iran                    Ireland 
##                         16                         13 
##                      Italy                    Jamaica 
##                         32                         25 
##                      Japan                       Laos 
##                         32                          5 
##                     Mexico                  Nicaragua 
##                        310                         15 
## Outlying-US(Guam-USVI-etc)                       Peru 
##                          9                         15 
##                Philippines                     Poland 
##                        109                         27 
##                   Portugal                Puerto-Rico 
##                         30                         70 
##                   Scotland                      South 
##                          9                         37 
##                     Taiwan                   Thailand 
##                         17                         13 
##            Trinadad&Tobago              United-States 
##                          8                      14813 
##                    Vietnam                 Yugoslavia 
##                         22                          7
newtest %>% group_by(native.country) %>%  summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 40 x 3
##        native.country     n         freq
##                <fctr> <int>        <dbl>
##  1           Cambodia    12 0.0007409237
##  2             Canada    61 0.0037663621
##  3              China    50 0.0030871820
##  4           Columbia    26 0.0016053347
##  5               Cuba    43 0.0026549765
##  6 Dominican-Republic    34 0.0020992838
##  7            Ecuador    17 0.0010496419
##  8        El-Salvador    49 0.0030254384
##  9            England    38 0.0023462583
## 10             France     9 0.0005556928
## # ... with 30 more rows
#Check collinearity issues
# Level shares of education — to be compared against education.num below.
newtrain %>% dplyr::count(education) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   931 0.028732794
##  2         11th  1175 0.036263194
##  3         12th   433 0.013363373
##  4      1st-4th   168 0.005184865
##  5      5th-6th   333 0.010277143
##  6      7th-8th   646 0.019937041
##  7          9th   513 0.015832356
##  8   Assoc-acdm  1066 0.032899204
##  9    Assoc-voc  1381 0.042620826
## 10    Bachelors  5314 0.164002222
## 11    Doctorate   401 0.012375779
## 12      HS-grad 10478 0.323375100
## 13      Masters  1705 0.052620209
## 14    Preschool    51 0.001573977
## 15  Prof-school   530 0.016357015
## 16 Some-college  7277 0.224584902
newtrain %>% group_by(education.num) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##    education.num     n        freq
##            <int> <int>       <dbl>
##  1             1    51 0.001573977
##  2             2   168 0.005184865
##  3             3   333 0.010277143
##  4             4   646 0.019937041
##  5             5   513 0.015832356
##  6             6   931 0.028732794
##  7             7  1175 0.036263194
##  8             8   433 0.013363373
##  9             9 10478 0.323375100
## 10            10  7277 0.224584902
## 11            11  1381 0.042620826
## 12            12  1066 0.032899204
## 13            13  5314 0.164002222
## 14            14  1705 0.052620209
## 15            15   530 0.016357015
## 16            16   401 0.012375779
# education and education.num carry the same information one-to-one (the two
# frequency tables above match exactly), so keep only the numeric form.
# Dropping by name is robust to column reordering; the original dropped
# column 4 by position, which breaks if any earlier column is added/removed.
newtrain$education <- NULL
newtest$education <- NULL

\(\\\)

\(\\\)

c) 6–8 exploratory data analyses (EDAs)

#Find correlations of the data - for collinearity issue checks
# Select the continuous columns by NAME instead of position — the original
# used c(1, 3, 4, 10, 12), which silently picks the wrong columns if the
# layout shifts. Names match the printed correlation matrix below.
cor(newtest[, c("age", "fnlwgt", "education.num", "capital.gain", "hours.per.week")])
##                        age        fnlwgt education.num  capital.gain
## age             1.00000000 -0.0759176992    0.01555523  0.1080390077
## fnlwgt         -0.07591770  1.0000000000   -0.02926279 -0.0007549241
## education.num   0.01555523 -0.0292627902    1.00000000  0.1417220957
## capital.gain    0.10803901 -0.0007549241    0.14172210  1.0000000000
## hours.per.week  0.07425722 -0.0026773627    0.12954445  0.0833160656
##                hours.per.week
## age               0.074257217
## fnlwgt           -0.002677363
## education.num     0.129544454
## capital.gain      0.083316066
## hours.per.week    1.000000000
cor(newtrain[, c(1, 3, 4, 10, 12)])
##                        age       fnlwgt education.num capital.gain
## age             1.00000000 -0.076917052    0.03330048  0.116518227
## fnlwgt         -0.07691705  1.000000000   -0.04362125 -0.004506565
## education.num   0.03330048 -0.043621248    1.00000000  0.145735884
## capital.gain    0.11651823 -0.004506565    0.14573588  1.000000000
## hours.per.week  0.06774934 -0.019547738    0.14384089  0.082952143
##                hours.per.week
## age                0.06774934
## fnlwgt            -0.01954774
## education.num      0.14384089
## capital.gain       0.08295214
## hours.per.week     1.00000000
#remove fnlwght variable.
# fnlwgt is essentially uncorrelated with every other continuous variable
# (|r| < 0.08 in the matrices above), so drop it. Removing by name is safer
# than the original positional drop of column 3.
newtrain$fnlwgt <- NULL
newtest$fnlwgt <- NULL



#See structure and summaries after removing outliers
str(newtest)
## 'data.frame':    16196 obs. of  13 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ income        : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
summary(newtest)
##       age                   workclass     education.num  
##  Min.   :17.00   Private         :11919   Min.   : 1.00  
##  1st Qu.:28.00   Self-emp-not-inc: 1421   1st Qu.: 9.00  
##  Median :37.00   Local-gov       : 1089   Median :10.00  
##  Mean   :38.72   State-gov       :  707   Mean   :10.06  
##  3rd Qu.:48.00   Self-emp-inc    :  570   3rd Qu.:12.00  
##  Max.   :90.00   Federal-gov     :  480   Max.   :16.00  
##                  (Other)         :   10                  
##                marital.status           occupation           relationship 
##  Divorced             :2181   Prof-specialty :2077   Husband       :6465  
##  Married-AF-spouse    :  13   Craft-repair   :2032   Not-in-family :4262  
##  Married-civ-spouse   :7340   Exec-managerial:2009   Other-relative: 525  
##  Married-spouse-absent: 210   Adm-clerical   :1965   Own-child     :2511  
##  Never-married        :5425   Sales          :1912   Unmarried     :1676  
##  Separated            : 503   Other-service  :1824   Wife          : 757  
##  Widowed              : 524   (Other)        :4377                        
##                  race           sex         capital.gain    
##  Amer-Indian-Eskimo:  159   Female: 5407   Min.   :    0.0  
##  Asian-Pac-Islander:  475   Male  :10789   1st Qu.:    0.0  
##  Black             : 1558                  Median :    0.0  
##  Other             :  134                  Mean   :  562.8  
##  White             :13870                  3rd Qu.:    0.0  
##                                            Max.   :41310.0  
##                                                             
##   capital.loss     hours.per.week        native.country     income     
##  Min.   :   0.00   Min.   : 1.00   United-States:14813   <=50K.:12435  
##  1st Qu.:   0.00   1st Qu.:40.00   Mexico       :  310   >50K. : 3761  
##  Median :   0.00   Median :40.00   Philippines  :  109                 
##  Mean   :  88.36   Mean   :40.33   Puerto-Rico  :   70                 
##  3rd Qu.:   0.00   3rd Qu.:45.00   Germany      :   69                 
##  Max.   :3770.00   Max.   :99.00   Canada       :   61                 
##                                    (Other)      :  764
str(newtrain)
## 'data.frame':    32402 obs. of  13 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
summary(newtrain)
##       age                   workclass     education.num  
##  Min.   :17.00   Private         :23984   Min.   : 1.00  
##  1st Qu.:28.00   Self-emp-not-inc: 2747   1st Qu.: 9.00  
##  Median :37.00   Local-gov       : 2187   Median :10.00  
##  Mean   :38.54   State-gov       : 1351   Mean   :10.07  
##  3rd Qu.:48.00   Self-emp-inc    : 1127   3rd Qu.:12.00  
##  Max.   :90.00   Federal-gov     :  983   Max.   :16.00  
##                  (Other)         :   23                  
##                marital.status            occupation  
##  Divorced             : 4432   Prof-specialty :4228  
##  Married-AF-spouse    :   23   Craft-repair   :4154  
##  Married-civ-spouse   :14844   Exec-managerial:4085  
##  Married-spouse-absent:  417   Adm-clerical   :3986  
##  Never-married        :10671   Other-service  :3694  
##  Separated            : 1023   Sales          :3690  
##  Widowed              :  992   (Other)        :8565  
##          relationship                   race           sex       
##  Husband       :13072   Amer-Indian-Eskimo:  311   Female:10749  
##  Not-in-family : 8284   Asian-Pac-Islander: 1029   Male  :21653  
##  Other-relative:  981   Black             : 3117                 
##  Own-child     : 5066   Other             :  269                 
##  Unmarried     : 3442   White             :27676                 
##  Wife          : 1557                                            
##                                                                  
##   capital.gain      capital.loss     hours.per.week        native.country 
##  Min.   :    0.0   Min.   :   0.00   Min.   : 1.00   United-States:29528  
##  1st Qu.:    0.0   1st Qu.:   0.00   1st Qu.:40.00   Mexico       :  656  
##  Median :    0.0   Median :   0.00   Median :40.00   Philippines  :  210  
##  Mean   :  592.2   Mean   :  87.73   Mean   :40.39   Germany      :  137  
##  3rd Qu.:    0.0   3rd Qu.:   0.00   3rd Qu.:45.00   Canada       :  120  
##  Max.   :41310.0   Max.   :4356.00   Max.   :99.00   Puerto-Rico  :  114  
##                                                      (Other)      : 1637  
##    income     
##  <=50K:24720  
##  >50K : 7682  
##               
##               
##               
##               
## 
#Analyzing/checking before discretizing
# table(newtrain[,14])
# table(newtest[,14])
# 
# plot(newtrain$education)
# plot(newtrain$occupation)
# plot(newtrain$native.country)
# 
# plot(newtest$education)
# plot(newtest$occupation)
# plot(newtest$native.country)



#Discretize training set
# discretetrainage <- discretize(newtrain$age, method = "interval", categories = 10)
# discretetrainfnlwgt <- discretize(newtrain$fnlwgt, method = "interval", categories = 10)
# discretetrainedunum <- discretize(newtrain$education.num, method = "interval", categories = 10)
# discretetraingain <- discretize(newtrain$capital.gain, method = "interval", categories = 10)
# discretetrainloss <- discretize(newtrain$capital.loss, method = "interval", categories = 10)
# discretetrainhours <- discretize(newtrain$hours.per.week, method = "interval", categories = 10)



#Binning
#Binning
# Collapse native-country values into four bins: the three most frequent
# countries (United-States, Mexico, Philippines) are kept as-is and every
# other value becomes "other_countries".
#
# Args:
#   vector: character vector of country names (callers pass
#           as.character(<factor>)).
# Returns: character vector of the same length with rare countries binned.
#
# Vectorized with %in% instead of the original element-wise loop: identical
# output for the same inputs, O(n) without per-element assignment, and —
# unlike the 1:length() loop — it is also safe on a zero-length vector.
countrydis <- function(vector){
  keep <- c("United-States", "Mexico", "Philippines")
  vector[!(vector %in% keep)] <- "other_countries"
  return(vector)
}

# Collapse workclass values: the six common employment classes are kept
# as-is and every other value (e.g. Without-pay, Never-worked) is binned
# into the catch-all label "No-gain".
#
# Args:
#   vector: character vector of workclass labels (callers pass
#           as.character(<factor>)).
# Returns: character vector of the same length with rare classes binned.
#
# Vectorized with %in% instead of the original element-wise loop: identical
# output for the same inputs, and safe on a zero-length vector where the
# 1:length() loop would error.
workdis <- function(vector){
  keep <- c("Federal-gov", "Local-gov", "Private", "Self-emp-inc",
            "Self-emp-not-inc", "State-gov")
  vector[!(vector %in% keep)] <- "No-gain"
  return(vector)
}

#discretetraincountry <- as.factor(countrydis(as.character(newtrain$native.country)))



#Discretize testing set
# discretetestage <- discretize(newtest$age, method = "interval", categories = 10)
# discretetestfnlwgt <- discretize(newtest$fnlwgt, method = "interval", categories = 10)
# discretetestedunum <- discretize(newtest$education.num, method = "interval", categories = 10)
# discretetestgain <- discretize(newtest$capital.gain, method = "interval", categories = 10)
# discretetestloss <- discretize(newtest$capital.loss, method = "interval", categories = 10)
# discretetesthours <- discretize(newtest$hours.per.week, method = "interval", categories = 10)
# discretetestcountry <- as.factor(countrydis(as.character(newtest$native.country)))
#Combine training and testing to make the same intervals for discretizing



# Tag each row with its origin, then stack train on top of test so the
# factor recodings below are computed once on the combined data and both
# splits end up with identical level sets.
newtrain$type <- "train"
newtest$type <- "test"
combined <- rbind(newtrain, newtest)



# discreteage <- discretize(combined$age, method = "interval", categories = 10)
# discretefnlwgt <- discretize(combined$fnlwgt, method = "interval", categories = 10)
# discreteedunum <- discretize(combined$education.num, method = "interval", categories = 10)
# discretegain <- discretize(combined$capital.gain, method = "interval", categories = 7) #not enough data
# discreteloss <- discretize(combined$capital.loss, method = "interval", categories = 7) #not enough data
# discretehours <- discretize(combined$hours.per.week, method = "interval", categories = 10)
# Bin native.country and workclass on the combined data (via the helper
# functions above), round-tripping through character so the binned values
# become the new factor levels.
discretecountry <- as.factor(countrydis(as.character(combined$native.country)))
discreteworkclass <- as.factor(workdis(as.character(combined$workclass)))



# combined$age <- discreteage
# combined$fnlwgt <- discretefnlwgt
# combined$education.num <- discreteedunum
# combined$capital.gain <- discretegain
# combined$capital.loss <- discreteloss
# combined$hours.per.week <- discretehours
# Overwrite the original columns with the binned versions.
combined$native.country <- discretecountry
combined$workclass <- discreteworkclass



dim(combined)
## [1] 48598    14
# Split back into train/test and drop the helper "type" column (14).
# NOTE(review): this relies on rbind() having kept all train rows first —
# true here because combined was built as rbind(newtrain, newtest).
newtrain2 <- combined[1:sum(combined$type == "train"), -14]
newtest2 <- combined[(sum(combined$type == "train") + 1):nrow(combined), -14]
dim(newtrain2)
## [1] 32402    13
dim(newtest2)
## [1] 16196    13
#plots
par(mfrow = c(2, 2)) #2 x 2 grid: four plots per page
# Plot each predictor (columns 1-12) against income (column 13);
# factor-vs-factor pairs render as mosaic plots.
for(i in 1:12){
  plot(newtrain2[, i], newtrain2[, 13])
}

for(i in 1:12){
  plot(newtest2[, i], newtest2[, 13])
}

#Assigning discretized variables
# newtrain2 <- newtrain
# newtest2 <- newtest
# dim(newtrain2)
# dim(newtest2)
# 
# newtrain2$age <- discretetrainage
# newtrain2$fnlwgt <- discretetrainfnlwgt
# newtrain2$education.num <- discretetrainedunum
# newtrain2$capital.gain <- discretetraingain
# newtrain2$capital.loss <- discretetrainloss
# newtrain2$hours.per.week <- discretetrainhours
# newtrain2$native.country <- discretetraincountry
# 
# newtest2$age <- discretetestage
# newtest2$fnlwgt <- discretetestfnlwgt
# newtest2$education.num <- discretetestedunum
# newtest2$capital.gain <- discretetestgain
# newtest2$capital.loss <- discretetestloss
# newtest2$hours.per.week <- discretetesthours
# newtest2$native.country <- discretetestcountry



#Dummify training set
# dummy() expands a factor into one 0/1 indicator column per level
# (presumably from the dummies package -- confirm against the library()
# calls earlier in the file).
dumtrainwork <- dummy(newtrain2$workclass)
dumtrainmarry <- dummy(newtrain2$marital.status)
dumtrainoccu <- dummy(newtrain2$occupation)
dumtrainrelation <- dummy(newtrain2$relationship)
dumtrainrace <- dummy(newtrain2$race)
dumtrainsex <- dummy(newtrain2$sex)
dumtraincountry <- dummy(newtrain2$native.country)



#Dummify testing set
# Same indicator expansion as for the training set; levels match because
# both splits were recoded together on the combined data.
dumtestwork <- dummy(newtest2$workclass)
dumtestmarry <- dummy(newtest2$marital.status)
dumtestoccu <- dummy(newtest2$occupation)
dumtestrelation <- dummy(newtest2$relationship)
dumtestrace <- dummy(newtest2$race)
dumtestsex <- dummy(newtest2$sex)
dumtestcountry <- dummy(newtest2$native.country)



#Take out columns
# Drop the original factor columns now replaced by dummies: workclass (2),
# marital.status (4), occupation (5), relationship (6), race (7), sex (8),
# native.country (12).
newtrain2 <- newtrain2[, -c(2, 4, 5, 6, 7, 8, 12)]
newtest2 <- newtest2[, -c(2, 4, 5, 6, 7, 8, 12)]



#Assigning dummified variables
# Bind the indicator matrices onto the numeric columns, then move income
# to the last position: copy it to column 45, drop the old copy at
# column 6, and restore the name "income" at column 44.
newtrain2 <- cbind(newtrain2, dumtrainwork, dumtrainmarry, dumtrainoccu,
                   dumtrainrelation, dumtrainrace, dumtrainsex, dumtraincountry)
newtrain2[, 45] <- newtrain2$income
newtrain2 <- newtrain2[, -6]
names(newtrain2)[44]<- "income"
dim(newtrain2)
## [1] 32402    44
newtest2 <- cbind(newtest2, dumtestwork, dumtestmarry, dumtestoccu,
                   dumtestrelation, dumtestrace, dumtestsex, dumtestcountry)
newtest2[, 45] <- newtest2$income
newtest2 <- newtest2[, -6]
names(newtest2)[44]<- "income"
dim(newtest2)
## [1] 16196    44
#fixing...
# Drop unused income levels left over from rbind()-ing the two splits
# (the test labels carry a trailing "."). NOTE(review): the second
# positional argument of droplevels() is `exclude`, not a keep-list; it is
# harmless here because those levels are unused in each subset, but a bare
# droplevels(x) would express the intent more clearly.
newtrain2$income <- droplevels(newtrain2$income, c("<=50K.", ">50K."))
newtest2$income <- droplevels(newtest2$income, c("<=50K", ">50K"))

# Strip the trailing "." from the test labels so both splits share the
# same two levels: "<=50K" and ">50K".
newtest2$income <- as.character(newtest2$income)
newtest2$income <- substr(newtest2$income, 1, nchar(newtest2$income) - 1)
newtest2$income <- as.factor(newtest2$income)



dim(newtrain2)
## [1] 32402    44
dim(newtest2)
## [1] 16196    44
str(newtrain2)
## 'data.frame':    32402 obs. of  44 variables:
##  $ age                  : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ education.num        : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ capital.gain         : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ Local-gov            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ No-gain              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : num  0 0 1 1 1 1 1 0 1 1 ...
##  $ Self-emp-inc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self-emp-not-inc     : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ State-gov            : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Married-AF-spouse    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-civ-spouse   : num  0 1 0 1 1 1 0 1 0 1 ...
##  $ Married-spouse-absent: num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Never-married        : num  1 0 0 0 0 0 0 0 1 0 ...
##  $ Separated            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed-Forces         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft-repair         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exec-managerial      : num  0 1 0 0 0 1 0 1 0 1 ...
##  $ Farming-fishing      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handlers-cleaners    : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ Machine-op-inspct    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other-service        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Priv-house-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof-specialty       : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ Protective-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tech-support         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport-moving     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not-in-family        : num  1 0 1 0 0 0 1 0 1 0 ...
##  $ Other-relative       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own-child            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unmarried            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wife                 : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ Asian-Pac-Islander   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : num  0 0 0 1 1 0 1 0 0 0 ...
##  $ Other                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : num  1 1 1 0 0 1 0 1 1 1 ...
##  $ Male                 : num  1 1 1 1 0 0 0 1 0 1 ...
##  $ other_countries      : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ Philippines          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ United-States        : num  1 1 1 1 0 1 0 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
str(newtest2)
## 'data.frame':    16196 obs. of  44 variables:
##  $ age                  : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ education.num        : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ capital.gain         : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Local-gov            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ No-gain              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : num  1 1 0 1 1 1 1 0 1 1 ...
##  $ Self-emp-inc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self-emp-not-inc     : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ State-gov            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-AF-spouse    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-civ-spouse   : num  0 1 1 1 0 0 0 1 0 1 ...
##  $ Married-spouse-absent: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Never-married        : num  1 0 0 0 1 1 1 0 1 0 ...
##  $ Separated            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed-Forces         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft-repair         : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Exec-managerial      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Farming-fishing      : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Handlers-cleaners    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Machine-op-inspct    : num  1 0 0 1 0 0 0 0 0 0 ...
##  $ Other-service        : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ Priv-house-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof-specialty       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Protective-serv      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Sales                : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Tech-support         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport-moving     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not-in-family        : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Other-relative       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own-child            : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ Unmarried            : num  0 0 0 0 0 0 1 0 1 0 ...
##  $ Wife                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian-Pac-Islander   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : num  1 0 0 1 0 0 1 0 0 0 ...
##  $ Other                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : num  0 1 1 0 1 1 0 1 1 1 ...
##  $ Male                 : num  1 1 1 1 0 1 1 1 0 1 ...
##  $ other_countries      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Philippines          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ United-States        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
## 'data.frame':    32402 obs. of  44 variables:
##  $ age                  : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ education.num        : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ capital.gain         : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ Local.gov            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.gain              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : int  0 0 1 1 1 1 1 0 1 1 ...
##  $ Self.emp.inc         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self.emp.not.inc     : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ State.gov            : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Married.AF.spouse    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married.civ.spouse   : int  0 1 0 1 1 1 0 1 0 1 ...
##  $ Married.spouse.absent: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Never.married        : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Separated            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed.Forces         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft.repair         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exec.managerial      : int  0 1 0 0 0 1 0 1 0 1 ...
##  $ Farming.fishing      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handlers.cleaners    : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ Machine.op.inspct    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other.service        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Priv.house.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof.specialty       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ Protective.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tech.support         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport.moving     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not.in.family        : int  1 0 1 0 0 0 1 0 1 0 ...
##  $ Other.relative       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own.child            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unmarried            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wife                 : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ Asian.Pac.Islander   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : int  0 0 0 1 1 0 1 0 0 0 ...
##  $ Other                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : int  1 1 1 0 0 1 0 1 1 1 ...
##  $ Male                 : int  1 1 1 1 0 0 0 1 0 1 ...
##  $ other_countries      : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ Philippines          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ United.States        : int  1 1 1 1 0 1 0 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
## 'data.frame':    16196 obs. of  44 variables:
##  $ age                  : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ education.num        : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ capital.gain         : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Local.gov            : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ No.gain              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : int  1 1 0 1 1 1 1 0 1 1 ...
##  $ Self.emp.inc         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self.emp.not.inc     : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ State.gov            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married.AF.spouse    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married.civ.spouse   : int  0 1 1 1 0 0 0 1 0 1 ...
##  $ Married.spouse.absent: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Never.married        : int  1 0 0 0 1 1 1 0 1 0 ...
##  $ Separated            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed.Forces         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft.repair         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Exec.managerial      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Farming.fishing      : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Handlers.cleaners    : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Machine.op.inspct    : int  1 0 0 1 0 0 0 0 0 0 ...
##  $ Other.service        : int  0 0 0 0 0 1 0 0 1 0 ...
##  $ Priv.house.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof.specialty       : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Protective.serv      : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Sales                : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Tech.support         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport.moving     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not.in.family        : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Other.relative       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own.child            : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ Unmarried            : int  0 0 0 0 0 0 1 0 1 0 ...
##  $ Wife                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian.Pac.Islander   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : int  1 0 0 1 0 0 1 0 0 0 ...
##  $ Other                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : int  0 1 1 0 1 1 0 1 1 1 ...
##  $ Male                 : int  1 1 1 1 0 1 1 1 0 1 ...
##  $ other_countries      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Philippines          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ United.States        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...

Classification tree using the rpart function - tuning method 1

threshold: https://stackoverflow.com/questions/46042966/set-threshold-for-the-probability-result-from-decision-tree

set.seed(100)  # reproducible cross-validation folds inside rpart



#Create a baseline Classification tree using gini index criterion using random cp
# minsplit = 5 and a very small cp let the tree grow freely; maxdepth = 5
# caps its depth.
tree <- rpart(income ~., data = newtrain2, method = "class",
              parms = list(split = 'gini'), control = rpart.control(minsplit = 5, cp = 0.0001, maxdepth = 5))



#Visualization of the tree
rpart.plot(tree)

#Pick the optimal tuning parameter
# Select the cp row whose cross-validated error (xerror) is smallest.
cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
cp   #0.0002603489
## [1] 0.0002603489
# NOTE(review): this optimal cp (~2.6e-4) is the smallest cp in the table,
# not literally the cp = 1e-4 passed to rpart -- pruning at it keeps
# essentially the full grown tree.



#Prune the tree using the optimal cp
treepruned <- prune(tree, cp = cp)
#Treepruned object
treepruned
## n= 32402 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 32402 7682 <=50K (0.762915869 0.237084131)  
##    2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)  
##      4) capital.gain< 7073.5 17274  849 <=50K (0.950850990 0.049149010) *
##      5) capital.gain>=7073.5 284   11 >50K (0.038732394 0.961267606)  
##       10) capital.gain>=30961.5 5    0 <=50K (1.000000000 0.000000000) *
##       11) capital.gain< 30961.5 279    6 >50K (0.021505376 0.978494624) *
##    3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)  
##      6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)  
##       12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)  
##         24) education.num< 8.5 1656  167 <=50K (0.899154589 0.100845411) *
##         25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)  
##           50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
##           51) capital.loss>=1782.5 335   83 >50K (0.247761194 0.752238806) *
##       13) capital.gain>=5095.5 496   11 >50K (0.022177419 0.977822581) *
##      7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)  
##       14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)  
##         28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)  
##           56) hours.per.week< 31 306  112 <=50K (0.633986928 0.366013072) *
##           57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
##         29) capital.loss>=1782.5 398   13 >50K (0.032663317 0.967336683) *
##       15) capital.gain>=5095.5 581    3 >50K (0.005163511 0.994836489) *
#Information by cp cross-validation results 
# printcp(): cp table with rel error, xerror, and xstd per split count.
printcp(treepruned)
## 
## Classification tree:
## rpart(formula = income ~ ., data = newtrain2, method = "class", 
##     parms = list(split = "gini"), control = rpart.control(minsplit = 5, 
##         cp = 0.0001, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] capital.gain       capital.loss       education.num     
## [4] hours.per.week     Married.civ.spouse
## 
## Root node error: 7682/32402 = 0.23708
## 
## n= 32402 
## 
##           CP nsplit rel error  xerror      xstd
## 1 0.12099714      0   1.00000 1.00000 0.0099655
## 2 0.06170268      2   0.75801 0.75801 0.0089967
## 3 0.03410570      3   0.69630 0.69630 0.0086993
## 4 0.01099974      4   0.66220 0.66220 0.0085245
## 5 0.00355810      6   0.64020 0.65217 0.0084717
## 6 0.00065087      9   0.62952 0.64215 0.0084182
## 7 0.00026035     10   0.62887 0.64007 0.0084069
plotcp(treepruned)

#summary information
# Full rpart summary: cp table, variable importance, and per-node detail
# (class counts, primary and surrogate splits), rounded to 3 digits
summary(treepruned, digits = 3)
## Call:
## rpart(formula = income ~ ., data = newtrain2, method = "class", 
##     parms = list(split = "gini"), control = rpart.control(minsplit = 5, 
##         cp = 0.0001, maxdepth = 5))
##   n= 32402 
## 
##         CP nsplit rel error xerror    xstd
## 1 0.120997      0     1.000  1.000 0.00997
## 2 0.061703      2     0.758  0.758 0.00900
## 3 0.034106      3     0.696  0.696 0.00870
## 4 0.011000      4     0.662  0.662 0.00852
## 5 0.003558      6     0.640  0.652 0.00847
## 6 0.000651      9     0.630  0.642 0.00842
## 7 0.000260     10     0.629  0.640 0.00841
## 
## Variable importance
## Married.civ.spouse      Never.married      education.num 
##                 27                 14                 12 
##       capital.gain      Not.in.family               Male 
##                 12                 10                  8 
##                age          Own.child     Prof.specialty 
##                  6                  4                  3 
##       capital.loss     hours.per.week 
##                  2                  1 
## 
## Node number 1: 32402 observations,    complexity param=0.121
##   predicted class=<=50K  expected loss=0.237  P(node) =1
##     class counts: 24720  7682
##    probabilities: 0.763 0.237 
##   left son=2 (17558 obs) right son=3 (14844 obs)
##   Primary splits:
##       Married.civ.spouse < 0.5   to the left,  improve=2300, (0 missing)
##       capital.gain       < 5120  to the left,  improve=1480, (0 missing)
##       education.num      < 12.5  to the left,  improve=1210, (0 missing)
##       Never.married      < 0.5   to the right, improve=1180, (0 missing)
##       age                < 29.5  to the left,  improve= 951, (0 missing)
##   Surrogate splits:
##       Never.married < 0.5   to the right, agree=0.787, adj=0.536, (0 split)
##       Not.in.family < 0.5   to the right, agree=0.713, adj=0.373, (0 split)
##       Male          < 0.5   to the left,  agree=0.688, adj=0.320, (0 split)
##       age           < 33.5  to the left,  agree=0.648, adj=0.231, (0 split)
##       Own.child     < 0.5   to the right, agree=0.609, adj=0.146, (0 split)
## 
## Node number 2: 17558 observations,    complexity param=0.0341
##   predicted class=<=50K  expected loss=0.0639  P(node) =0.542
##     class counts: 16436  1122
##    probabilities: 0.936 0.064 
##   left son=4 (17274 obs) right son=5 (284 obs)
##   Primary splits:
##       capital.gain   < 7070  to the left,  improve=465.0, (0 missing)
##       education.num  < 12.5  to the left,  improve=142.0, (0 missing)
##       hours.per.week < 43.5  to the left,  improve=106.0, (0 missing)
##       age            < 28.5  to the left,  improve= 68.8, (0 missing)
##       capital.loss   < 2370  to the left,  improve= 59.4, (0 missing)
## 
## Node number 3: 14844 observations,    complexity param=0.121
##   predicted class=<=50K  expected loss=0.442  P(node) =0.458
##     class counts:  8284  6560
##    probabilities: 0.558 0.442 
##   left son=6 (10475 obs) right son=7 (4369 obs)
##   Primary splits:
##       education.num   < 12.5  to the left,  improve=908, (0 missing)
##       capital.gain    < 5100  to the left,  improve=690, (0 missing)
##       Exec.managerial < 0.5   to the left,  improve=331, (0 missing)
##       Prof.specialty  < 0.5   to the left,  improve=319, (0 missing)
##       capital.loss    < 1780  to the left,  improve=268, (0 missing)
##   Surrogate splits:
##       Prof.specialty  < 0.5   to the left,  agree=0.791, adj=0.289, (0 split)
##       capital.gain    < 7490  to the left,  agree=0.717, adj=0.040, (0 split)
##       Exec.managerial < 0.5   to the left,  agree=0.712, adj=0.021, (0 split)
##       capital.loss    < 1890  to the left,  agree=0.711, adj=0.018, (0 split)
##       State.gov       < 0.5   to the left,  agree=0.707, adj=0.004, (0 split)
## 
## Node number 4: 17274 observations
##   predicted class=<=50K  expected loss=0.0491  P(node) =0.533
##     class counts: 16425   849
##    probabilities: 0.951 0.049 
## 
## Node number 5: 284 observations,    complexity param=0.000651
##   predicted class=>50K   expected loss=0.0387  P(node) =0.00876
##     class counts:    11   273
##    probabilities: 0.039 0.961 
##   left son=10 (5 obs) right son=11 (279 obs)
##   Primary splits:
##       capital.gain      < 31000 to the right, improve=9.41, (0 missing)
##       age               < 21    to the left,  improve=7.50, (0 missing)
##       Handlers.cleaners < 0.5   to the right, improve=3.72, (0 missing)
##       hours.per.week    < 35.5  to the left,  improve=3.04, (0 missing)
##       education.num     < 10.5  to the left,  improve=1.87, (0 missing)
##   Surrogate splits:
##       age < 21    to the left,  agree=0.996, adj=0.8, (0 split)
## 
## Node number 6: 10475 observations,    complexity param=0.0617
##   predicted class=<=50K  expected loss=0.329  P(node) =0.323
##     class counts:  7029  3446
##    probabilities: 0.671 0.329 
##   left son=12 (9979 obs) right son=13 (496 obs)
##   Primary splits:
##       capital.gain    < 5100  to the left,  improve=438, (0 missing)
##       education.num   < 8.5   to the left,  improve=178, (0 missing)
##       age             < 35.5  to the left,  improve=134, (0 missing)
##       Exec.managerial < 0.5   to the left,  improve=125, (0 missing)
##       capital.loss    < 1780  to the left,  improve=118, (0 missing)
## 
## Node number 7: 4369 observations,    complexity param=0.00356
##   predicted class=>50K   expected loss=0.287  P(node) =0.135
##     class counts:  1255  3114
##    probabilities: 0.287 0.713 
##   left son=14 (3788 obs) right son=15 (581 obs)
##   Primary splits:
##       capital.gain   < 5100  to the left,  improve=107.0, (0 missing)
##       capital.loss   < 1780  to the left,  improve= 56.8, (0 missing)
##       hours.per.week < 31    to the left,  improve= 54.9, (0 missing)
##       age            < 28.5  to the left,  improve= 40.6, (0 missing)
##       education.num  < 13.5  to the left,  improve= 31.9, (0 missing)
## 
## Node number 10: 5 observations
##   predicted class=<=50K  expected loss=0  P(node) =0.000154
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 11: 279 observations
##   predicted class=>50K   expected loss=0.0215  P(node) =0.00861
##     class counts:     6   273
##    probabilities: 0.022 0.978 
## 
## Node number 12: 9979 observations,    complexity param=0.011
##   predicted class=<=50K  expected loss=0.297  P(node) =0.308
##     class counts:  7018  2961
##    probabilities: 0.703 0.297 
##   left son=24 (1656 obs) right son=25 (8323 obs)
##   Primary splits:
##       education.num   < 8.5   to the left,  improve=152, (0 missing)
##       capital.loss    < 1780  to the left,  improve=139, (0 missing)
##       age             < 35.5  to the left,  improve=108, (0 missing)
##       Exec.managerial < 0.5   to the left,  improve=102, (0 missing)
##       hours.per.week  < 34.5  to the left,  improve= 60, (0 missing)
##   Surrogate splits:
##       age < 17.5  to the left,  agree=0.834, adj=0.001, (0 split)
## 
## Node number 13: 496 observations
##   predicted class=>50K   expected loss=0.0222  P(node) =0.0153
##     class counts:    11   485
##    probabilities: 0.022 0.978 
## 
## Node number 14: 3788 observations,    complexity param=0.00356
##   predicted class=>50K   expected loss=0.331  P(node) =0.117
##     class counts:  1252  2536
##    probabilities: 0.331 0.669 
##   left son=28 (3390 obs) right son=29 (398 obs)
##   Primary splits:
##       capital.loss   < 1780  to the left,  improve=78.9, (0 missing)
##       hours.per.week < 31    to the left,  improve=52.3, (0 missing)
##       age            < 28.5  to the left,  improve=33.9, (0 missing)
##       education.num  < 13.5  to the left,  improve=30.7, (0 missing)
##       capital.gain   < 3120  to the right, improve=30.0, (0 missing)
## 
## Node number 15: 581 observations
##   predicted class=>50K   expected loss=0.00516  P(node) =0.0179
##     class counts:     3   578
##    probabilities: 0.005 0.995 
## 
## Node number 24: 1656 observations
##   predicted class=<=50K  expected loss=0.101  P(node) =0.0511
##     class counts:  1489   167
##    probabilities: 0.899 0.101 
## 
## Node number 25: 8323 observations,    complexity param=0.011
##   predicted class=<=50K  expected loss=0.336  P(node) =0.257
##     class counts:  5529  2794
##    probabilities: 0.664 0.336 
##   left son=50 (7988 obs) right son=51 (335 obs)
##   Primary splits:
##       capital.loss    < 1780  to the left,  improve=121.0, (0 missing)
##       age             < 35.5  to the left,  improve=121.0, (0 missing)
##       Exec.managerial < 0.5   to the left,  improve= 77.0, (0 missing)
##       education.num   < 9.5   to the left,  improve= 61.8, (0 missing)
##       hours.per.week  < 34.5  to the left,  improve= 57.2, (0 missing)
## 
## Node number 28: 3390 observations,    complexity param=0.00356
##   predicted class=>50K   expected loss=0.365  P(node) =0.105
##     class counts:  1239  2151
##    probabilities: 0.365 0.635 
##   left son=56 (306 obs) right son=57 (3084 obs)
##   Primary splits:
##       hours.per.week  < 31    to the left,  improve=48.5, (0 missing)
##       age             < 28.5  to the left,  improve=29.9, (0 missing)
##       capital.gain    < 3120  to the right, improve=25.8, (0 missing)
##       Exec.managerial < 0.5   to the left,  improve=24.8, (0 missing)
##       education.num   < 13.5  to the left,  improve=24.1, (0 missing)
##   Surrogate splits:
##       age < 66.5  to the right, agree=0.917, adj=0.075, (0 split)
## 
## Node number 29: 398 observations
##   predicted class=>50K   expected loss=0.0327  P(node) =0.0123
##     class counts:    13   385
##    probabilities: 0.033 0.967 
## 
## Node number 50: 7988 observations
##   predicted class=<=50K  expected loss=0.318  P(node) =0.247
##     class counts:  5446  2542
##    probabilities: 0.682 0.318 
## 
## Node number 51: 335 observations
##   predicted class=>50K   expected loss=0.248  P(node) =0.0103
##     class counts:    83   252
##    probabilities: 0.248 0.752 
## 
## Node number 56: 306 observations
##   predicted class=<=50K  expected loss=0.366  P(node) =0.00944
##     class counts:   194   112
##    probabilities: 0.634 0.366 
## 
## Node number 57: 3084 observations
##   predicted class=>50K   expected loss=0.339  P(node) =0.0952
##     class counts:  1045  2039
##    probabilities: 0.339 0.661
#Variable importance
# caret::varImp() on the rpart object: total split improvement attributed to
# each predictor (0 means the variable was never used in a split)
varimp <- varImp(treepruned)
varimp
##                           Overall
## age                   1494.555990
## capital.gain          3240.846180
## capital.loss           841.558520
## education.num         2742.553467
## Exec.managerial        660.776425
## Handlers.cleaners        3.722355
## hours.per.week         382.023696
## Married.civ.spouse    2298.950331
## Never.married         1175.483228
## Prof.specialty         318.529199
## Local.gov                0.000000
## No.gain                  0.000000
## Private                  0.000000
## Self.emp.inc             0.000000
## Self.emp.not.inc         0.000000
## State.gov                0.000000
## Married.AF.spouse        0.000000
## Married.spouse.absent    0.000000
## Separated                0.000000
## Widowed                  0.000000
## Armed.Forces             0.000000
## Craft.repair             0.000000
## Farming.fishing          0.000000
## Machine.op.inspct        0.000000
## Other.service            0.000000
## Priv.house.serv          0.000000
## Protective.serv          0.000000
## Sales                    0.000000
## Tech.support             0.000000
## Transport.moving         0.000000
## Not.in.family            0.000000
## Other.relative           0.000000
## Own.child                0.000000
## Unmarried                0.000000
## Wife                     0.000000
## Asian.Pac.Islander       0.000000
## Black                    0.000000
## Other                    0.000000
## White                    0.000000
## Male                     0.000000
## other_countries          0.000000
## Philippines              0.000000
## United.States            0.000000
#Visualization of variable importance
# Bar chart of variable importance, bars sorted from most to least important;
# x-axis labels rotated so the many dummy-variable names stay readable
varimp[["name"]] <- rownames(varimp)
ggplot(varimp, aes(x = reorder(name, -Overall), y = Overall)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, size = 6, color = "blue"))

#Visualization of pruned tree
rpart.plot(treepruned)

#predicted income class from pruned tree object on train dataset
treepred1 <- predict(treepruned, newdata = newtrain2, type = "class")



#Confusion matrix - train dataset
# caret::confusionMatrix signature is (data = predictions, reference = truth);
# ordering them this way keeps Sensitivity/Specificity/PPV correctly labelled.
# Overall accuracy is unaffected by the ordering; stored output predates this fix.
confusion1 <- confusionMatrix(treepred1, newtrain2$income)
confusion1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 23559  1161
##      >50K   3670  4012
##                                                
##                Accuracy : 0.8509               
##                  95% CI : (0.847, 0.8548)      
##     No Information Rate : 0.8403               
##     P-Value [Acc > NIR] : 0.0000000869         
##                                                
##                   Kappa : 0.5356               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8652               
##             Specificity : 0.7756               
##          Pos Pred Value : 0.9530               
##          Neg Pred Value : 0.5223               
##              Prevalence : 0.8403               
##          Detection Rate : 0.7271               
##    Detection Prevalence : 0.7629               
##       Balanced Accuracy : 0.8204               
##                                                
##        'Positive' Class : <=50K                
## 
#Training accuracy rate
# Accuracy = proportion of observations on the confusion-matrix diagonal;
# diag() generalizes the hand-indexed [1,1] + [2,2] form
sum(diag(confusion1$table)) / sum(confusion1$table)
## [1] 0.8509043
treepred2 <- predict(treepruned, newdata = newtest2, type="class")



#Confusion matrix - test dataset
# Arguments ordered as (data = predictions, reference = truth) per
# caret::confusionMatrix, so Sensitivity/Specificity are labelled correctly.
# Accuracy is unchanged by the ordering; stored output predates this fix.
confusion2 <- confusionMatrix(treepred2, newtest2$income)
confusion2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11872   563
##      >50K   1837  1924
##                                               
##                Accuracy : 0.8518              
##                  95% CI : (0.8462, 0.8573)    
##     No Information Rate : 0.8464              
##     P-Value [Acc > NIR] : 0.02926             
##                                               
##                   Kappa : 0.5288              
##  Mcnemar's Test P-Value : < 0.0000000000000002
##                                               
##             Sensitivity : 0.8660              
##             Specificity : 0.7736              
##          Pos Pred Value : 0.9547              
##          Neg Pred Value : 0.5116              
##              Prevalence : 0.8464              
##          Detection Rate : 0.7330              
##    Detection Prevalence : 0.7678              
##       Balanced Accuracy : 0.8198              
##                                               
##        'Positive' Class : <=50K               
## 
#Misclassification Rate of pruned tree on test dataset (off-diagonal share)
(confusion2$table[1, 2] + confusion2$table[2, 1]) / sum(confusion2$table)
## [1] 0.1481847
#Accuracy Rate of pruned tree on test dataset (diagonal share)
sum(diag(confusion2$table)) / sum(confusion2$table)
## [1] 0.8518153
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Baseline model's ROC curve
#Getting predicted >50K of income probabilities 
tree_prob <- predict(tree, newdata = newtest2, type = "prob")[, 2]
# Qualify prediction() with ROCR:: for consistency with the performance()
# calls in this file (and to avoid clashes with other attached packages)
tree_prediction <- ROCR::prediction(tree_prob, newtest2$income)
tree_performance <- ROCR::performance(tree_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
# TPR vs FPR for the baseline tree; dashed diagonal = random classifier
plot(tree_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
# performance(..., "auc") stores the scalar AUC in its y.values slot
tree.auc <- ROCR::performance(tree_prediction, measure = "auc")@y.values[[1]]
tree.auc
## [1] 0.8768653
#Pick the best threshold
str(tree_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:15] 0 0.000241 0.000241 0.000483 0.001045 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:15] 0 0.033 0.107 0.166 0.213 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:15] Inf 1 0.995 0.982 0.967 ...
# Assemble cutoff / FPR / TPR triples from the performance object's S4 slots
# so candidate classification thresholds can be inspected directly
cutoffs <- data.frame(cut = tree_performance@alpha.values[[1]], 
                      fpr = tree_performance@x.values[[1]], 
                      tpr = tree_performance@y.values[[1]])
head(cutoffs)
##         cut          fpr        tpr
## 1       Inf 0.0000000000 0.00000000
## 2 1.0000000 0.0002412545 0.03296995
## 3 0.9948365 0.0002412545 0.10688647
## 4 0.9817814 0.0004825090 0.16591332
## 5 0.9673367 0.0010454363 0.21270939
## 6 0.7522388 0.0038600724 0.24541345
roc <- pROC::roc(newtest2$income, tree_prob)
# pROC::coords() returns a data.frame in pROC >= 1.16 (named numeric before);
# unlist() keeps this robust across versions so threshold is a plain scalar.
# [1] picks the first optimum if several thresholds tie for "best".
threshold <- as.numeric(unlist(pROC::coords(roc, "best", ret = "threshold")))[1]
cat("The best threshold is :  " , threshold, "\n")
## The best threshold is :   0.1993402
#Get accuracy rate of testset data using the optimal threshold  ****
# confusionMatrix() requires factors (recent caret errors on logical input);
# fixed levels keep both factors aligned as (data = predictions, reference = truth)
confusionMatrix(factor(tree_prob > threshold, levels = c(FALSE, TRUE)),
                factor(newtest2$income == ">50K", levels = c(FALSE, TRUE)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  9084  491
##      TRUE   3351 3270
##                                              
##                Accuracy : 0.7628             
##                  95% CI : (0.7562, 0.7693)   
##     No Information Rate : 0.7678             
##     P-Value [Acc > NIR] : 0.9351             
##                                              
##                   Kappa : 0.4742             
##  Mcnemar's Test P-Value : <0.0000000000000002
##                                              
##             Sensitivity : 0.7305             
##             Specificity : 0.8694             
##          Pos Pred Value : 0.9487             
##          Neg Pred Value : 0.4939             
##              Prevalence : 0.7678             
##          Detection Rate : 0.5609             
##    Detection Prevalence : 0.5912             
##       Balanced Accuracy : 0.8000             
##                                              
##        'Positive' Class : FALSE              
## 
#Pruned model's ROC curve
#Getting predicted >50K of income probabilities 
pruned_prob <- predict(treepruned, newdata = newtest2, type = "prob")[, 2]
# Qualify prediction() with ROCR:: for consistency with the performance()
# calls in this file (and to avoid clashes with other attached packages)
pruned_prediction <- ROCR::prediction(pruned_prob, newtest2$income)
pruned_performance <- ROCR::performance(pruned_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
# TPR vs FPR for the pruned tree; dashed diagonal = random classifier
plot(pruned_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
# performance(..., "auc") stores the scalar AUC in its y.values slot
pruned.auc <- ROCR::performance(pruned_prediction,
                                measure="auc")@y.values[[1]]
pruned.auc
## [1] 0.858427
#Pick the best threshold
str(pruned_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:12] 0 0 0.000322 0.000643 0.001206 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:12] 0 0.0739 0.1072 0.1662 0.213 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:12] Inf 0.995 0.978 0.978 0.967 ...
# Assemble cutoff / FPR / TPR triples from the performance object's S4 slots
cutoffs <- data.frame(cut = pruned_performance@alpha.values[[1]], 
                      fpr = pruned_performance@x.values[[1]], 
                      tpr = pruned_performance@y.values[[1]])
head(cutoffs)
##         cut          fpr        tpr
## 1       Inf 0.0000000000 0.00000000
## 2 0.9948365 0.0000000000 0.07391651
## 3 0.9784946 0.0003216727 0.10715235
## 4 0.9778226 0.0006433454 0.16617921
## 5 0.9673367 0.0012062726 0.21297527
## 6 0.7522388 0.0040209087 0.24567934
roc <- pROC::roc(newtest2$income, pruned_prob)
# pROC::coords() returns a data.frame in pROC >= 1.16 (named numeric before);
# unlist() keeps this robust across versions so threshold is a plain scalar.
# [1] picks the first optimum if several thresholds tie for "best".
threshold <- as.numeric(unlist(pROC::coords(roc, "best", ret = "threshold")))[1]
cat("The best threshold is :  " , threshold, "\n")
## The best threshold is :   0.2095364
#Get accuracy rate of testset data using the optimal threshold  ****
# confusionMatrix() requires factors (recent caret errors on logical input);
# fixed levels keep both factors aligned as (data = predictions, reference = truth)
confusionMatrix(factor(pruned_prob > threshold, levels = c(FALSE, TRUE)),
                factor(newtest2$income == ">50K", levels = c(FALSE, TRUE)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  9098  504
##      TRUE   3337 3257
##                                              
##                Accuracy : 0.7628             
##                  95% CI : (0.7562, 0.7694)   
##     No Information Rate : 0.7678             
##     P-Value [Acc > NIR] : 0.9327             
##                                              
##                   Kappa : 0.4733             
##  Mcnemar's Test P-Value : <0.0000000000000002
##                                              
##             Sensitivity : 0.7316             
##             Specificity : 0.8660             
##          Pos Pred Value : 0.9475             
##          Neg Pred Value : 0.4939             
##              Prevalence : 0.7678             
##          Detection Rate : 0.5617             
##    Detection Prevalence : 0.5929             
##       Balanced Accuracy : 0.7988             
##                                              
##        'Positive' Class : FALSE              
## 

\(\\\)

\(\\\)

Classification tree using the rpart function - tuning method 2

# Fix the RNG state so the CV fold assignment is reproducible
set.seed(100)



# 10-fold cross-validation, repeated 3 times
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Training the Decision Tree classifier with criterion as gini index;
# caret tunes cp over a grid of 10 candidate values and refits at the best one
dtree_fit <- caret::train(
  income ~ .,
  data = newtrain2,
  method = "rpart",
  parms = list(split = "gini"),
  trControl = trctrl,
  tuneLength = 10
)
dtree_fit
## CART 
## 
## 32402 samples
##    43 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.001171570  0.8578071  0.5689948
##   0.001366832  0.8577248  0.5670715
##   0.002212965  0.8565314  0.5584858
##   0.002629524  0.8552043  0.5505177
##   0.003558101  0.8514082  0.5409312
##   0.006769071  0.8451535  0.5205961
##   0.010999740  0.8448962  0.5199522
##   0.034105702  0.8385798  0.4924548
##   0.061702682  0.8264613  0.4422631
##   0.120997136  0.7879256  0.1876654
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.00117157.
#Tuning parameter - cp
# Best cp value selected by repeated CV (largest accuracy)
dtree_fit$bestTune
##           cp
## 1 0.00117157
#The model we selected by using the optimal cp we got
# Final rpart tree refit on the full training set at the selected cp
dtree_fit$finalModel
## n= 32402 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 32402 7682 <=50K (0.762915869 0.237084131)  
##      2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)  
##        4) capital.gain< 7073.5 17274  849 <=50K (0.950850990 0.049149010) *
##        5) capital.gain>=7073.5 284   11 >50K (0.038732394 0.961267606) *
##      3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)  
##        6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)  
##         12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)  
##           24) education.num< 8.5 1656  167 <=50K (0.899154589 0.100845411) *
##           25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)  
##             50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341)  
##              100) age< 35.5 2704  552 <=50K (0.795857988 0.204142012) *
##              101) age>=35.5 5284 1990 <=50K (0.623391370 0.376608630)  
##                202) hours.per.week< 34.5 578   91 <=50K (0.842560554 0.157439446) *
##                203) hours.per.week>=34.5 4706 1899 <=50K (0.596472588 0.403527412)  
##                  406) education.num< 9.5 2622  908 <=50K (0.653699466 0.346300534) *
##                  407) education.num>=9.5 2084  991 <=50K (0.524472169 0.475527831)  
##                    814) Self.emp.not.inc>=0.5 245   69 <=50K (0.718367347 0.281632653) *
##                    815) Self.emp.not.inc< 0.5 1839  917 >50K (0.498640566 0.501359434)  
##                     1630) Exec.managerial< 0.5 1504  704 <=50K (0.531914894 0.468085106)  
##                       3260) Tech.support< 0.5 1390  629 <=50K (0.547482014 0.452517986) *
##                       3261) Tech.support>=0.5 114   39 >50K (0.342105263 0.657894737) *
##                     1631) Exec.managerial>=0.5 335  117 >50K (0.349253731 0.650746269) *
##             51) capital.loss>=1782.5 335   83 >50K (0.247761194 0.752238806)  
##              102) capital.loss>=1989.5 94   21 <=50K (0.776595745 0.223404255) *
##              103) capital.loss< 1989.5 241   10 >50K (0.041493776 0.958506224) *
##         13) capital.gain>=5095.5 496   11 >50K (0.022177419 0.977822581) *
##        7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)  
##         14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)  
##           28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)  
##             56) hours.per.week< 31 306  112 <=50K (0.633986928 0.366013072)  
##              112) Wife< 0.5 233   67 <=50K (0.712446352 0.287553648) *
##              113) Wife>=0.5 73   28 >50K (0.383561644 0.616438356) *
##             57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345)  
##              114) age< 28.5 211   89 <=50K (0.578199052 0.421800948) *
##              115) age>=28.5 2873  923 >50K (0.321266968 0.678733032)  
##                230) capital.gain>=3120 52   11 <=50K (0.788461538 0.211538462) *
##                231) capital.gain< 3120 2821  882 >50K (0.312655087 0.687344913)  
##                  462) Exec.managerial< 0.5 1998  695 >50K (0.347847848 0.652152152)  
##                    924) Prof.specialty< 0.5 911  398 >50K (0.436882547 0.563117453)  
##                     1848) Other.service>=0.5 35    8 <=50K (0.771428571 0.228571429) *
##                     1849) Other.service< 0.5 876  371 >50K (0.423515982 0.576484018)  
##                       3698) Self.emp.not.inc>=0.5 123   50 <=50K (0.593495935 0.406504065) *
##                       3699) Self.emp.not.inc< 0.5 753  298 >50K (0.395750332 0.604249668) *
##                    925) Prof.specialty>=0.5 1087  297 >50K (0.273229071 0.726770929) *
##                  463) Exec.managerial>=0.5 823  187 >50K (0.227217497 0.772782503) *
##           29) capital.loss>=1782.5 398   13 >50K (0.032663317 0.967336683) *
##         15) capital.gain>=5095.5 581    3 >50K (0.005163511 0.994836489) *
#Plot classification tree 
# prp() from rpart.plot: red palette, slightly shrunk text (tweak), full
# factor labels (faclen = 0), class counts in each node (extra = 1)
prp(dtree_fit$finalModel, box.palette = "Reds", tweak = 0.8, 
    fallen.leaves = FALSE, faclen = 0, extra = 1)

#Variable importance
# varImp() on the caret final model; 0 means the predictor was never used
# in a primary or surrogate split
varimp2 <- varImp(dtree_fit$finalModel)
varimp2
##                           Overall
## age                   1632.953509
## capital.gain          3279.194816
## capital.loss           979.941955
## education.num         2956.577327
## Exec.managerial        902.677175
## Farming.fishing        120.791701
## Handlers.cleaners       19.440365
## hours.per.week         508.081524
## Local.gov                1.464908
## Male                    10.337328
## Married.civ.spouse    2298.950331
## Never.married         1175.483228
## other_countries         11.895201
## Other.service          155.426390
## Prof.specialty         347.449953
## Sales                   12.786564
## Self.emp.not.inc       127.720699
## Tech.support             8.888007
## Transport.moving         5.897710
## Wife                    12.024713
## No.gain                  0.000000
## Private                  0.000000
## Self.emp.inc             0.000000
## State.gov                0.000000
## Married.AF.spouse        0.000000
## Married.spouse.absent    0.000000
## Separated                0.000000
## Widowed                  0.000000
## Armed.Forces             0.000000
## Craft.repair             0.000000
## Machine.op.inspct        0.000000
## Priv.house.serv          0.000000
## Protective.serv          0.000000
## Not.in.family            0.000000
## Other.relative           0.000000
## Own.child                0.000000
## Unmarried                0.000000
## Asian.Pac.Islander       0.000000
## Black                    0.000000
## Other                    0.000000
## White                    0.000000
## Philippines              0.000000
## United.States            0.000000
#Visualization of variable importance
# Bar chart of variable importance, bars sorted from most to least important;
# x-axis labels rotated so the many dummy-variable names stay readable
varimp2[["name"]] <- rownames(varimp2)
ggplot(varimp2, aes(x = reorder(name, -Overall), y = Overall)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, size = 6, color = "blue"))

#Predicted income class from the finalmodel tree object on train dataset
treepred3 <- predict(dtree_fit$finalModel, newdata = newtrain2, type = "class")



#Confusion matrix - train dataset
# caret::confusionMatrix signature is (data = predictions, reference = truth);
# ordering them this way keeps Sensitivity/Specificity/PPV correctly labelled.
# Overall accuracy is unaffected by the ordering; stored output predates this fix.
confusion3 <- confusionMatrix(treepred3, newtrain2$income)
confusion3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 23706  1014
##      >50K   3511  4171
##                                                
##                Accuracy : 0.8603               
##                  95% CI : (0.8565, 0.8641)     
##     No Information Rate : 0.84                 
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5653               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8710               
##             Specificity : 0.8044               
##          Pos Pred Value : 0.9590               
##          Neg Pred Value : 0.5430               
##              Prevalence : 0.8400               
##          Detection Rate : 0.7316               
##    Detection Prevalence : 0.7629               
##       Balanced Accuracy : 0.8377               
##                                                
##        'Positive' Class : <=50K                
## 
#Training accuracy rate
# Accuracy = proportion of observations on the confusion-matrix diagonal;
# diag() generalizes the hand-indexed [1,1] + [2,2] form
sum(diag(confusion3$table)) / sum(confusion3$table)
## [1] 0.8603481
#Predicted income class from the finalmodel tree object on test dataset
treepred4 <- predict(dtree_fit$finalModel, newdata = newtest2, type = "class")



#Confusion matrix - test dataset
# Arguments ordered as (data = predictions, reference = truth) per
# caret::confusionMatrix, so Sensitivity/Specificity are labelled correctly.
# Accuracy is unchanged by the ordering; stored output predates this fix.
confusion4 <- confusionMatrix(treepred4, newtest2$income)
confusion4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11918   517
##      >50K   1747  2014
##                                                
##                Accuracy : 0.8602               
##                  95% CI : (0.8548, 0.8655)     
##     No Information Rate : 0.8437               
##     P-Value [Acc > NIR] : 0.000000002442       
##                                                
##                   Kappa : 0.5575               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8722               
##             Specificity : 0.7957               
##          Pos Pred Value : 0.9584               
##          Neg Pred Value : 0.5355               
##              Prevalence : 0.8437               
##          Detection Rate : 0.7359               
##    Detection Prevalence : 0.7678               
##       Balanced Accuracy : 0.8339               
##                                                
##        'Positive' Class : <=50K                
## 
#Misclassification Rate of finalmodel tree on test dataset (off-diagonal share)
(confusion4$table[1, 2] + confusion4$table[2, 1]) / sum(confusion4$table)
## [1] 0.1397876
#Accuracy Rate of finalmodel tree on test dataset (diagonal share)
sum(diag(confusion4$table)) / sum(confusion4$table)
## [1] 0.8602124
#Predicted probability of income ">50K" (second class column) for the test set
gini_prob <- predict(dtree_fit, newdata = newtest2, type = "prob")[, 2]
#ROCR objects: pair each probability with the true label, then compute TPR vs FPR
gini_prediction <- prediction(gini_prob, newtest2$income)
gini_performance <- ROCR::performance(gini_prediction, measure = "tpr", x.measure = "fpr")



#ROC Curve  : https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Plot ROC curve with the chance diagonal as reference
plot(gini_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC (area under the ROC curve) from the same ROCR prediction object
gini.auc <- ROCR::performance(gini_prediction, measure="auc")@y.values[[1]]
gini.auc
## [1] 0.8718559
#Inspect the performance object's slots (cutoffs, fpr, tpr vectors)
str(gini_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:25] 0 0 0.000322 0.000885 0.001287 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:25] 0 0.0739 0.1329 0.1797 0.213 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:25] Inf 0.995 0.978 0.967 0.961 ...
#Collect each probability cutoff with its false/true positive rate in one data frame
cutoffs <- data.frame(cut = gini_performance@alpha.values[[1]], 
                      fpr = gini_performance@x.values[[1]], 
                      tpr = gini_performance@y.values[[1]])
head(cutoffs)
##         cut          fpr        tpr
## 1       Inf 0.0000000000 0.00000000
## 2 0.9948365 0.0000000000 0.07391651
## 3 0.9778226 0.0003216727 0.13294337
## 4 0.9673367 0.0008845999 0.17973943
## 5 0.9612676 0.0012866908 0.21297527
## 6 0.9585062 0.0016887817 0.24381813
#pROC: threshold that maximizes Youden's J (sensitivity + specificity - 1)
roc <- pROC::roc(newtest2$income, gini_prob)
threshold <- coords(roc, "best", ret = "threshold")
# NOTE(review): newer pROC versions return a data.frame from coords(); confirm
# `threshold` is a plain numeric before it is used in the comparison below
cat("The best threshold is :  ", threshold, "\n")
## The best threshold is :   0.2259878
#Get accuracy rate of testset data using the optimal threshold  ****
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(gini_prob > threshold, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 10364   861
##      TRUE   2071  2900
##                                                
##                Accuracy : 0.819                
##                  95% CI : (0.8129, 0.8249)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5435               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8335               
##             Specificity : 0.7711               
##          Pos Pred Value : 0.9233               
##          Neg Pred Value : 0.5834               
##              Prevalence : 0.7678               
##          Detection Rate : 0.6399               
##    Detection Prevalence : 0.6931               
##       Balanced Accuracy : 0.8023               
##                                                
##        'Positive' Class : FALSE                
## 
#====================================================================



#Train a decision tree with splitting criterion = information gain (cross entropy),
#reusing the repeated-CV control object (trctrl) and a 10-value cp tuning grid
set.seed(100)
dtree_fit_info <- caret::train(income ~., data = newtrain2, method = "rpart",
                   parms = list(split = "information"),
                   trControl = trctrl,
                   tuneLength = 10)
dtree_fit_info
## CART 
## 
## 32402 samples
##    43 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.001171570  0.8576940  0.5683314
##   0.001366832  0.8572001  0.5639238
##   0.002212965  0.8547208  0.5507302
##   0.002629524  0.8536818  0.5444061
##   0.003558101  0.8501429  0.5352532
##   0.006769071  0.8442687  0.5163148
##   0.010999740  0.8432194  0.5114075
##   0.034105702  0.8385798  0.4924548
##   0.061702682  0.8264613  0.4422631
##   0.120997136  0.7879256  0.1876654
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.00117157.
#Tuning parameter: the cp value selected by cross-validated accuracy
dtree_fit_info$bestTune
##           cp
## 1 0.00117157
#The final rpart model refit on the full training data with the optimal cp
dtree_fit_info$finalModel
## n= 32402 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##        1) root 32402 7682 <=50K (0.762915869 0.237084131)  
##          2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)  
##            4) capital.gain< 7073.5 17274  849 <=50K (0.950850990 0.049149010) *
##            5) capital.gain>=7073.5 284   11 >50K (0.038732394 0.961267606) *
##          3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)  
##            6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)  
##             12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)  
##               24) education.num< 8.5 1656  167 <=50K (0.899154589 0.100845411) *
##               25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)  
##                 50) age< 35.5 2782  599 <=50K (0.784687275 0.215312725)  
##                  100) age< 24.5 338   19 <=50K (0.943786982 0.056213018) *
##                  101) age>=24.5 2444  580 <=50K (0.762684124 0.237315876)  
##                    202) capital.loss< 1794 2371  533 <=50K (0.775200337 0.224799663) *
##                    203) capital.loss>=1794 73   26 >50K (0.356164384 0.643835616)  
##                      406) capital.loss>=1989.5 25    1 <=50K (0.960000000 0.040000000) *
##                      407) capital.loss< 1989.5 48    2 >50K (0.041666667 0.958333333) *
##                 51) age>=35.5 5541 2195 <=50K (0.603862119 0.396137881)  
##                  102) capital.loss< 1782.5 5284 1990 <=50K (0.623391370 0.376608630)  
##                    204) hours.per.week< 34.5 578   91 <=50K (0.842560554 0.157439446) *
##                    205) hours.per.week>=34.5 4706 1899 <=50K (0.596472588 0.403527412)  
##                      410) education.num< 9.5 2622  908 <=50K (0.653699466 0.346300534) *
##                      411) education.num>=9.5 2084  991 <=50K (0.524472169 0.475527831)  
##                        822) Self.emp.not.inc>=0.5 245   69 <=50K (0.718367347 0.281632653) *
##                        823) Self.emp.not.inc< 0.5 1839  917 >50K (0.498640566 0.501359434)  
##                         1646) Exec.managerial< 0.5 1504  704 <=50K (0.531914894 0.468085106)  
##                           3292) Handlers.cleaners>=0.5 38    5 <=50K (0.868421053 0.131578947) *
##                           3293) Handlers.cleaners< 0.5 1466  699 <=50K (0.523192360 0.476807640)  
##                             6586) Other.service>=0.5 59   13 <=50K (0.779661017 0.220338983) *
##                             6587) Other.service< 0.5 1407  686 <=50K (0.512437811 0.487562189)  
##                              13174) capital.loss>=1532 12    0 <=50K (1.000000000 0.000000000) *
##                              13175) capital.loss< 1532 1395  686 <=50K (0.508243728 0.491756272)  
##                                26350) Transport.moving>=0.5 102   31 <=50K (0.696078431 0.303921569) *
##                                26351) Transport.moving< 0.5 1293  638 >50K (0.493426141 0.506573859)  
##                                  52702) capital.gain>=4699.5 10    0 <=50K (1.000000000 0.000000000) *
##                                  52703) capital.gain< 4699.5 1283  628 >50K (0.489477786 0.510522214)  
##                                   105406) Tech.support< 0.5 1171  580 <=50K (0.504696840 0.495303160)  
##                                     210812) Machine.op.inspct>=0.5 95   32 <=50K (0.663157895 0.336842105) *
##                                     210813) Machine.op.inspct< 0.5 1076  528 >50K (0.490706320 0.509293680)  
##                                       421626) age>=58.5 91   32 <=50K (0.648351648 0.351648352) *
##                                       421627) age< 58.5 985  469 >50K (0.476142132 0.523857868)  
##                                         843254) hours.per.week< 43.5 623  306 <=50K (0.508828250 0.491171750)  
##                                          1686508) capital.gain< 4225 616  299 <=50K (0.514610390 0.485389610)  
##                                            3373016) capital.gain>=3120 8    0 <=50K (1.000000000 0.000000000) *
##                                            3373017) capital.gain< 3120 608  299 <=50K (0.508223684 0.491776316)  
##                                              6746034) Protective.serv< 0.5 566  268 <=50K (0.526501767 0.473498233)  
##                                               13492068) Prof.specialty< 0.5 477  213 <=50K (0.553459119 0.446540881)  
##                                                 26984136) age< 47.5 315  124 <=50K (0.606349206 0.393650794) *
##                                                 26984137) age>=47.5 162   73 >50K (0.450617284 0.549382716) *
##                                               13492069) Prof.specialty>=0.5 89   34 >50K (0.382022472 0.617977528) *
##                                              6746035) Protective.serv>=0.5 42   11 >50K (0.261904762 0.738095238) *
##                                          1686509) capital.gain>=4225 7    0 >50K (0.000000000 1.000000000) *
##                                         843255) hours.per.week>=43.5 362  152 >50K (0.419889503 0.580110497) *
##                                   105407) Tech.support>=0.5 112   37 >50K (0.330357143 0.669642857) *
##                         1647) Exec.managerial>=0.5 335  117 >50K (0.349253731 0.650746269) *
##                  103) capital.loss>=1782.5 257   52 >50K (0.202334630 0.797665370)  
##                    206) capital.loss>=1989.5 66   20 <=50K (0.696969697 0.303030303) *
##                    207) capital.loss< 1989.5 191    6 >50K (0.031413613 0.968586387) *
##             13) capital.gain>=5095.5 496   11 >50K (0.022177419 0.977822581) *
##            7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)  
##             14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)  
##               28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)  
##                 56) hours.per.week< 31 306  112 <=50K (0.633986928 0.366013072)  
##                  112) Wife< 0.5 233   67 <=50K (0.712446352 0.287553648) *
##                  113) Wife>=0.5 73   28 >50K (0.383561644 0.616438356) *
##                 57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345)  
##                  114) age< 28.5 211   89 <=50K (0.578199052 0.421800948) *
##                  115) age>=28.5 2873  923 >50K (0.321266968 0.678733032)  
##                    230) capital.gain>=3120 52   11 <=50K (0.788461538 0.211538462) *
##                    231) capital.gain< 3120 2821  882 >50K (0.312655087 0.687344913)  
##                      462) Exec.managerial< 0.5 1998  695 >50K (0.347847848 0.652152152)  
##                        924) Prof.specialty< 0.5 911  398 >50K (0.436882547 0.563117453)  
##                         1848) Other.service>=0.5 35    8 <=50K (0.771428571 0.228571429) *
##                         1849) Other.service< 0.5 876  371 >50K (0.423515982 0.576484018)  
##                           3698) Self.emp.not.inc>=0.5 123   50 <=50K (0.593495935 0.406504065) *
##                           3699) Self.emp.not.inc< 0.5 753  298 >50K (0.395750332 0.604249668) *
##                        925) Prof.specialty>=0.5 1087  297 >50K (0.273229071 0.726770929) *
##                      463) Exec.managerial>=0.5 823  187 >50K (0.227217497 0.772782503) *
##               29) capital.loss>=1782.5 398   13 >50K (0.032663317 0.967336683) *
##             15) capital.gain>=5095.5 581    3 >50K (0.005163511 0.994836489) *
#Plot the classification tree; extra = 1 shows per-class counts in each node
prp(dtree_fit_info$finalModel, box.palette = "Blues", tweak = 1.2, extra = 1)

#Variable importance scores from the final information-gain rpart model
varimp3 <- varImp(dtree_fit_info$finalModel)
varimp3
##                            Overall
## age                   2666.4332900
## capital.gain          4204.0735108
## capital.loss          1246.0965411
## Craft.repair            19.5942637
## education.num         3823.9420314
## Exec.managerial        970.5439782
## Farming.fishing        224.4187849
## Handlers.cleaners       21.8938004
## hours.per.week         895.4359272
## Machine.op.inspct       27.5632778
## Male                    10.8358134
## Married.civ.spouse    3386.3856340
## Never.married         1987.9976540
## other_countries         13.0766863
## Other.service          137.7986339
## Own.child              230.5841961
## Private                  2.5392738
## Prof.specialty         368.7640471
## Protective.serv         18.2255630
## Sales                   17.5572307
## Self.emp.inc             0.8764151
## Self.emp.not.inc       139.2962504
## Tech.support            47.1275640
## Transport.moving        22.3790987
## United.States            2.6915940
## Wife                    12.5911026
## Local.gov                0.0000000
## No.gain                  0.0000000
## State.gov                0.0000000
## Married.AF.spouse        0.0000000
## Married.spouse.absent    0.0000000
## Separated                0.0000000
## Widowed                  0.0000000
## Armed.Forces             0.0000000
## Priv.house.serv          0.0000000
## Not.in.family            0.0000000
## Other.relative           0.0000000
## Unmarried                0.0000000
## Asian.Pac.Islander       0.0000000
## Black                    0.0000000
## Other                    0.0000000
## White                    0.0000000
## Philippines              0.0000000
#Visualization of variable importance for the information-gain tree.
#Bug fix: plot varimp3 (this model's importances), not varimp2 from the
#previous (gini) model — the original line was a copy-paste error.
varimp3 <- data.frame(varimp3, name = rownames(varimp3))
ggplot(varimp3, aes(x = reorder(name, -Overall), y = Overall)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(color = "blue", size = 6, angle = 90))

# Predicted income class on the training set from the information-gain tree
treepred <- predict(dtree_fit_info$finalModel, newdata = newtrain2, type = "class")

# Confusion matrix on the training set (rows = observed, cols = predicted)
confusion <- confusionMatrix(newtrain2$income, treepred)
print(confusion)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 23440  1280
##      >50K   3119  4563
##                                                
##                Accuracy : 0.8642               
##                  95% CI : (0.8605, 0.8679)     
##     No Information Rate : 0.8197               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.591                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8826               
##             Specificity : 0.7809               
##          Pos Pred Value : 0.9482               
##          Neg Pred Value : 0.5940               
##              Prevalence : 0.8197               
##          Detection Rate : 0.7234               
##    Detection Prevalence : 0.7629               
##       Balanced Accuracy : 0.8317               
##                                                
##        'Positive' Class : <=50K                
## 
# Training accuracy rate: diagonal of the confusion table / total
sum(diag(confusion$table)) / sum(confusion$table)
## [1] 0.8642368
# Predict income class on the test set with the information-gain tree
treepred1 <- predict(dtree_fit_info$finalModel, newdata = newtest2, type = "class")



# Confusion matrix on the test set (rows = observed, cols = predicted)
confusion1 <- confusionMatrix(newtest2$income, treepred1)
print(confusion1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11767   668
##      >50K   1585  2176
##                                                
##                Accuracy : 0.8609               
##                  95% CI : (0.8555, 0.8662)     
##     No Information Rate : 0.8244               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5736               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.8813               
##             Specificity : 0.7651               
##          Pos Pred Value : 0.9463               
##          Neg Pred Value : 0.5786               
##              Prevalence : 0.8244               
##          Detection Rate : 0.7265               
##    Detection Prevalence : 0.7678               
##       Balanced Accuracy : 0.8232               
##                                                
##        'Positive' Class : <=50K                
## 
# Misclassification rate on the test set: off-diagonal cells / total
(sum(confusion1$table) - sum(diag(confusion1$table))) / sum(confusion1$table)
## [1] 0.1391084
# Accuracy rate on the test set: diagonal cells / total
sum(diag(confusion1$table)) / sum(confusion1$table)
## [1] 0.8608916
#Predicted probability of income ">50K" (second class column) for the test set
info_prob <- predict(dtree_fit_info, newdata = newtest2, type = "prob")[, 2]
info_prediction <- prediction(info_prob, newtest2$income)
info_performance <- ROCR::performance(info_prediction, measure = "tpr", x.measure = "fpr")



#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Plot ROC curve with the chance diagonal as reference
plot(info_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC from the same ROCR prediction object
info.auc <- ROCR::performance(info_prediction, measure = "auc")@y.values[[1]]
info.auc
## [1] 0.872279
#Inspect cutoffs / fpr / tpr slots (Youden-best threshold, not accuracy-best)
str(info_performance)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:39] 0 0 0 0.000322 0.000643 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:39] 0 0.000798 0.074714 0.133741 0.157937 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:39] Inf 1 0.995 0.978 0.969 ...
#Collect each probability cutoff with its false/true positive rate in one data frame
cutoffs <- data.frame(cut = info_performance@alpha.values[[1]], 
                      fpr = info_performance@x.values[[1]], 
                      tpr = info_performance@y.values[[1]])
head(cutoffs)
##         cut          fpr          tpr
## 1       Inf 0.0000000000 0.0000000000
## 2 1.0000000 0.0000000000 0.0007976602
## 3 0.9948365 0.0000000000 0.0747141718
## 4 0.9778226 0.0003216727 0.1337410263
## 5 0.9685864 0.0006433454 0.1579367190
## 6 0.9673367 0.0012062726 0.2047327838
#pROC: threshold that maximizes Youden's J (sensitivity + specificity - 1)
roc <- pROC::roc(newtest2$income, info_prob)
threshold <- coords(roc, "best", ret = "threshold")
# NOTE(review): newer pROC versions return a data.frame from coords(); confirm
# `threshold` is a plain numeric before it is used in the comparison below
cat("The best threshold is :  ", threshold, "\n")
## The best threshold is :   0.184489
#Get accuracy rate of testset data using the optimal threshold  ****
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(info_prob > threshold, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  9549  574
##      TRUE   2886 3187
##                                                
##                Accuracy : 0.7864               
##                  95% CI : (0.78, 0.7927)       
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : 0.000000008282       
##                                                
##                   Kappa : 0.5067               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.7679               
##             Specificity : 0.8474               
##          Pos Pred Value : 0.9433               
##          Neg Pred Value : 0.5248               
##              Prevalence : 0.7678               
##          Detection Rate : 0.5896               
##    Detection Prevalence : 0.6250               
##       Balanced Accuracy : 0.8076               
##                                                
##        'Positive' Class : FALSE                
## 

\(\\\)

\(\\\)

Compare the ROC curves and AUC values of the differently tuned trees

set.seed(100)
#Overlay the four ROC curves: pruned (blue), gini-tuned (red),
#unpruned (green), information-gain-tuned (default black)
plot(pruned_performance, main = "ROC curve", col = "blue")
plot(gini_performance, add = TRUE, col = "red")
plot(tree_performance, add = TRUE, col = "green")
plot(info_performance, add = TRUE)
abline(a = 0, b = 1, lty = 2)
#Legend colors follow the plotting order above (3rd method = black, unpruned = green)
legend("bottomright", legend = c("Pruned - 1st method", "Tunned - 2nd method",
                                 "Tunned - 3rd method","unprunned"),
       col=c("blue", "red", "black", "green"), lwd=3, cex=.45, horiz = TRUE)

\(\\\)

\(\\\)

Pick, for each model, the threshold that leads to the best accuracy

set.seed(100)
# Candidate thresholds: 0.001, 0.002, ..., 0.999
thresholds <- seq(from = 0.001, 0.999, 0.001)

# Training-set probabilities of ">50K" from the gini-tuned tree
gini_prob.train <- predict(dtree_fit, newdata = newtrain2,
                           type = "prob")[, 2]

# Training accuracy at each candidate threshold (gini-tuned model)
accuracy <- vapply(
  thresholds,
  function(th) mean((gini_prob.train > th) == (newtrain2$income == ">50K")),
  numeric(1)
)

# Threshold giving the maximum training accuracy
thres1 <- thresholds[which.max(accuracy)]
thres1
## [1] 0.453
# Plot accuracy vs threshold
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get accuracy rate of testset data using the optimal threshold
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(gini_prob > thres1, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11918  1747
##      TRUE    517  2014
##                                                
##                Accuracy : 0.8602               
##                  95% CI : (0.8548, 0.8655)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5575               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9584               
##             Specificity : 0.5355               
##          Pos Pred Value : 0.8722               
##          Neg Pred Value : 0.7957               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7359               
##    Detection Prevalence : 0.8437               
##       Balanced Accuracy : 0.7470               
##                                                
##        'Positive' Class : FALSE                
## 
#Test accuracy rate of the gini-tuned tree at the tuned optimal threshold
prunned.gini.accuracy <- mean((gini_prob > thres1) == (newtest2$income == ">50K"))

#Test accuracy rate at the default threshold (0.5)
prunned.gini.accuracy.half <- mean((gini_prob > 0.5) == (newtest2$income == ">50K"))



#==================================================================



# Training-set probabilities of ">50K" from the information-gain-tuned tree
info_prob.train <- predict(dtree_fit_info, newdata = newtrain2,
                           type = "prob")[, 2]

# Training accuracy at each candidate threshold (information-gain model;
# the original comment said "gini" — a copy-paste slip)
accuracy <- vapply(
  thresholds,
  function(th) mean((info_prob.train > th) == (newtrain2$income == ">50K")),
  numeric(1)
)

# Threshold giving the maximum training accuracy
thres2 <- thresholds[which.max(accuracy)]
thres2
## [1] 0.422
# Plot accuracy vs threshold
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get accuracy rate of testset data using the optimal threshold
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(info_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11767  1585
##      TRUE    668  2176
##                                                
##                Accuracy : 0.8609               
##                  95% CI : (0.8555, 0.8662)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5736               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9463               
##             Specificity : 0.5786               
##          Pos Pred Value : 0.8813               
##          Neg Pred Value : 0.7651               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7265               
##    Detection Prevalence : 0.8244               
##       Balanced Accuracy : 0.7624               
##                                                
##        'Positive' Class : FALSE                
## 
#Test accuracy rate of the information-gain tree at the tuned optimal threshold
prunned.info.accuracy <- mean((info_prob > thres2) == (newtest2$income == ">50K"))

#Test accuracy rate at the default threshold (0.5)
prunned.info.accuracy.half <- mean((info_prob > 0.5) == (newtest2$income == ">50K"))

#==================================================================



# Training-set probabilities of ">50K" from the unpruned tree
tree_prob.train <- predict(tree, newdata = newtrain2, type = "prob")[, 2]

# Training accuracy at each candidate threshold (unpruned tree;
# the original comment said "gini" — a copy-paste slip)
accuracy <- vapply(
  thresholds,
  function(th) mean((tree_prob.train > th) == (newtrain2$income == ">50K")),
  numeric(1)
)

# Threshold giving the maximum training accuracy
thres3 <- thresholds[which.max(accuracy)]
thres3
## [1] 0.367
# Plot accuracy vs threshold
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get accuracy rate of testset data using the optimal threshold
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(tree_prob > thres3, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11859  1825
##      TRUE    576  1936
##                                                
##                Accuracy : 0.8518               
##                  95% CI : (0.8462, 0.8572)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5298               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9537               
##             Specificity : 0.5148               
##          Pos Pred Value : 0.8666               
##          Neg Pred Value : 0.7707               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7322               
##    Detection Prevalence : 0.8449               
##       Balanced Accuracy : 0.7342               
##                                                
##        'Positive' Class : FALSE                
## 
#Test accuracy rate of the unpruned tree at the tuned optimal threshold
unprunned.accuracy <- mean((tree_prob > thres3) == (newtest2$income == ">50K"))

#Test accuracy rate at the default threshold (0.5) — original comment wrongly said "optimal"
unprunned.accuracy.half <- mean((tree_prob > 0.5) == (newtest2$income == ">50K"))

#==================================================================



# Training-set probabilities of ">50K" from the pruned tree
pruned_prob.train <- predict(treepruned, newdata = newtrain2,
                             type = "prob")[, 2]

# Training accuracy at each candidate threshold (pruned tree;
# the original comment said "gini" — a copy-paste slip)
accuracy <- vapply(
  thresholds,
  function(th) mean((pruned_prob.train > th) == (newtrain2$income == ">50K")),
  numeric(1)
)

# Threshold giving the maximum training accuracy
thres4 <- thresholds[which.max(accuracy)]
thres4
## [1] 0.367
# Plot accuracy vs threshold
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get accuracy rate of testset data using the optimal threshold
# NOTE(review): recent caret requires factor inputs to confusionMatrix(); the
# logical vectors below worked in the version used to knit this output — verify
confusionMatrix(pruned_prob > thres4, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11872  1837
##      TRUE    563  1924
##                                                
##                Accuracy : 0.8518               
##                  95% CI : (0.8462, 0.8573)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5288               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9547               
##             Specificity : 0.5116               
##          Pos Pred Value : 0.8660               
##          Neg Pred Value : 0.7736               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7330               
##    Detection Prevalence : 0.8464               
##       Balanced Accuracy : 0.7331               
##                                                
##        'Positive' Class : FALSE                
## 
#Test accuracy rate of the pruned tree at the tuned optimal threshold
prunned.accuracy <- mean((pruned_prob > thres4) == (newtest2$income == ">50K"))

#Test accuracy rate at the default threshold (0.5)
prunned.accuracy.half <- mean((pruned_prob > 0.5) == (newtest2$income == ">50K"))

Compare Accuracy & AUC

set.seed(100)
#Compare AUC of the four tree variants, columns sorted ascending
# NOTE(review): order() on a one-row data.frame relies on list dispatch; the
# knitted output shows it sorted as intended, but verify on the installed R version
auc <- data.frame(pruned.auc, info.auc, gini.auc, tree.auc)
auc[, order(auc)]
##   pruned.auc  gini.auc info.auc  tree.auc
## 1   0.858427 0.8718559 0.872279 0.8768653
#Pick the model with the largest AUC - unprunned tree
final.auc1 <- tree


#Compare Accuracy at each model's tuned optimal threshold, sorted ascending
accuracy.tree.df <- data.frame(unprunned.accuracy, prunned.accuracy,
                          prunned.gini.accuracy, prunned.info.accuracy)
accuracy.tree.df[, order(accuracy.tree.df)]
##   unprunned.accuracy prunned.accuracy prunned.gini.accuracy
## 1          0.8517535        0.8518153             0.8602124
##   prunned.info.accuracy
## 1             0.8608916
#Pick the model with the highest Accuracy - prunned.info.accuracy
final.thres1 <- dtree_fit_info


#Compare Accuracy at the default threshold (0.5), sorted ascending
accuracy.tree.df.half <- data.frame(unprunned.accuracy.half,
                                    prunned.accuracy.half,
                                    prunned.gini.accuracy.half,
                                    prunned.info.accuracy.half)

accuracy.tree.df.half[, order(accuracy.tree.df.half)] 
##   unprunned.accuracy.half prunned.accuracy.half prunned.gini.accuracy.half
## 1               0.8517535             0.8518153                  0.8602124
##   prunned.info.accuracy.half
## 1                  0.8608916
#Pick the model with the highest Accuracy - prunned.info.accuracy
final.thres1.half <- dtree_fit_info

Comment:

We found that when we apply the threshold that gives the maximum accuracy rate on the training dataset to predictions on the test dataset, we eventually end up with the same accuracy rate that we obtained on the test dataset.

We picked the unpruned tree model based on AUC, and the model tuned by information gain (cross entropy) based on accuracy rate.

Bagged tree — simply a special case of a random forest with m = p (all predictors considered at every split).

Train and tune a Bagged classifier

set.seed(100)



#=============================================================



#Create mlr classification tasks; ">50K" is declared the positive class
traintask <- makeClassifTask(data = newtrain2, target = "income", positive = ">50K")
testtask <- makeClassifTask(data = newtest2, target = "income", positive = ">50K")



#Brief view of the training task (observations, feature types, class counts)
traintask
## Supervised task: newtrain2
## Type: classif
## Target: income
## Observations: 32402
## Features:
## numerics  factors  ordered 
##       43        0        0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K  >50K 
## 24720  7682 
## Positive class: >50K
#For deeper View
str(getTaskData(traintask))
## 'data.frame':    32402 obs. of  44 variables:
##  $ age                  : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ education.num        : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ capital.gain         : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ Local.gov            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.gain              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : int  0 0 1 1 1 1 1 0 1 1 ...
##  $ Self.emp.inc         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self.emp.not.inc     : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ State.gov            : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Married.AF.spouse    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married.civ.spouse   : int  0 1 0 1 1 1 0 1 0 1 ...
##  $ Married.spouse.absent: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Never.married        : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Separated            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed.Forces         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft.repair         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exec.managerial      : int  0 1 0 0 0 1 0 1 0 1 ...
##  $ Farming.fishing      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handlers.cleaners    : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ Machine.op.inspct    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other.service        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Priv.house.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof.specialty       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ Protective.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tech.support         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport.moving     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not.in.family        : int  1 0 1 0 0 0 1 0 1 0 ...
##  $ Other.relative       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own.child            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unmarried            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wife                 : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ Asian.Pac.Islander   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : int  0 0 0 1 1 0 1 0 0 0 ...
##  $ Other                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : int  1 1 1 0 0 1 0 1 1 1 ...
##  $ Male                 : int  1 1 1 1 0 0 0 1 0 1 ...
##  $ other_countries      : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ Philippines          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ United.States        : int  1 1 1 1 0 1 0 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
#Create a bagging learner
# Base learner: an rpart tree with the gini splitting criterion;
# `parms` is forwarded to rpart as a hyperparameter by makeLearner().
bagged <- makeLearner("classif.rpart",  parms = list(split = "gini"),
                      predict.type = "response")



#Set up the bagging algorithm which will grow 100 trees on randomized samples of data with replacement.
bag <- makeBaggingWrapper(learner = bagged, bw.iters = 100, bw.replace = TRUE)
# Q  :  bw.iters [integer(1)] Iterations = number of fitted models in bagging. Default is 10



#To check the performance, set up a validation strategy
#set 3 fold cross validation
rdesc <- makeResampleDesc("CV", iters = 3L)



# With 100 trees, bagging has returned an accuracy of 84.5%
# (tpr/fpr/fnr/tnr/acc are averaged over the 3 CV folds)
r <- resample(learner = bag , task = traintask, resampling = rdesc, 
              measures = list(tpr, fpr, fnr, tnr, acc), show.info = T)
## [Resample] cross-validation iter 1: tpr.test.mean=0.522,fpr.test.mean=0.0512,fnr.test.mean=0.478,tnr.test.mean=0.949,acc.test.mean=0.846
## [Resample] cross-validation iter 2: tpr.test.mean=0.528,fpr.test.mean=0.0549,fnr.test.mean=0.472,tnr.test.mean=0.945,acc.test.mean=0.849
## [Resample] cross-validation iter 3: tpr.test.mean=0.494,fpr.test.mean=0.0526,fnr.test.mean=0.506,tnr.test.mean=0.947,acc.test.mean=0.839
## [Resample] Aggr. Result: tpr.test.mean=0.515,fpr.test.mean=0.0529,fnr.test.mean=0.485,tnr.test.mean=0.947,acc.test.mean=0.845
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from bagged model
r
## Resample Result
## Task: newtrain2
## Learner: classif.rpart.bagged
## Aggr perf: tpr.test.mean=0.515,fpr.test.mean=0.0529,fnr.test.mean=0.485,tnr.test.mean=0.947,acc.test.mean=0.845
## Runtime: 117.164
#Aggr. Result: tpr.test.mean=0.514,fpr.test.mean=0.0554,fnr.test.mean=0.486,tnr.test.mean=0.945,acc.test.mean=0.843



#=============================================================



#Make a random bagged learner (mtry = number of variables in dataset)
# With mtry = 43 (all 43 predictors considered at every split), the
# random forest degenerates into plain bagging.
bag.rf <- makeLearner("classif.randomForest", predict.type = "response",
                  par.vals = list(ntree = 50L, mtry = 43, 
                                  importance = TRUE))



r2 <- resample(learner = bag.rf, task = traintask, resampling = rdesc, 
               measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.626,fpr.test.mean=0.0816,fnr.test.mean=0.374,tnr.test.mean=0.918,acc.test.mean=0.849
## [Resample] cross-validation iter 2: tpr.test.mean=0.613,fpr.test.mean=0.088,fnr.test.mean=0.387,tnr.test.mean=0.912,acc.test.mean=0.84
## [Resample] cross-validation iter 3: tpr.test.mean=0.628,fpr.test.mean=0.0824,fnr.test.mean=0.372,tnr.test.mean=0.918,acc.test.mean=0.85
## [Resample] Aggr. Result: tpr.test.mean=0.622,fpr.test.mean=0.084,fnr.test.mean=0.378,tnr.test.mean=0.916,acc.test.mean=0.846
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
r2
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.622,fpr.test.mean=0.084,fnr.test.mean=0.378,tnr.test.mean=0.916,acc.test.mean=0.846
## Runtime: 64.1211
#Aggr perf: tpr.test.mean=0.636,fpr.test.mean=0.0883,fnr.test.mean=0.364,tnr.test.mean=0.912,acc.test.mean=0.846



#Internally, random forest uses a cutoff of 0.5  --> 
#if a particular unseen observation has a probability higher than 0.5, it will be classified as >50K.
#In random forest, we have the option to customize the internal cutoff. As the false negative rate is very high now, we'll increase the cutoff for negative classes (<=50K) and accordingly reduce it for positive classes (>50K). Then, train the model again.



#Evaluating by using new cutoff
# NOTE: this mutates bag.rf in place — every later use of bag.rf
# (including the "untunned" models below) carries this cutoff.
bag.rf$par.vals <- list(ntree = 50L, mtry = 43, importance = TRUE, cutoff = c(0.55, 0.45))
r3 <- resample(learner = bag.rf, task = traintask, resampling = rdesc, 
              measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.647,fpr.test.mean=0.103,fnr.test.mean=0.353,tnr.test.mean=0.897,acc.test.mean=0.84
## [Resample] cross-validation iter 2: tpr.test.mean=0.69,fpr.test.mean=0.102,fnr.test.mean=0.31,tnr.test.mean=0.898,acc.test.mean=0.848
## [Resample] cross-validation iter 3: tpr.test.mean=0.659,fpr.test.mean=0.102,fnr.test.mean=0.341,tnr.test.mean=0.898,acc.test.mean=0.84
## [Resample] Aggr. Result: tpr.test.mean=0.665,fpr.test.mean=0.103,fnr.test.mean=0.335,tnr.test.mean=0.897,acc.test.mean=0.842
#Show true positive rate, false positive rate, false negative rate, true negative rate, and accuracy rate from random forest model
r3
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.665,fpr.test.mean=0.103,fnr.test.mean=0.335,tnr.test.mean=0.897,acc.test.mean=0.842
## Runtime: 59.9893
#Aggr perf: tpr.test.mean=0.636,fpr.test.mean=0.0646,fnr.test.mean=0.364,tnr.test.mean=0.935,acc.test.mean=0.864   ---> we can see that false negative rate is decreased even though the accuracy rate stays the same. I have tried cutoff = c(0.6, 0.4), cutoff = c(0.7, 0.3) but they all gave lower accuracy late.



#========================================================================



#Let's see how the test classification error changes as we increase the number of trees for untunned model  ( #number of trees VS test classification error)


#Train a old untunned model
# NOTE(review): bag.rf was modified above to include
# cutoff = c(0.55, 0.45), so this "untunned" model already uses the
# custom cutoff — confirm that is the intended configuration.
untunnedbagged <- mlr::train(bag.rf, traintask)

# predict.all = TRUE returns $individual: one predicted label per
# observation (rows) per tree (columns), on the training data.
bag.untunned_ind <- predict(untunnedbagged$learner.model, newtrain2, 
                    predict.all = T)$individual
head(bag.untunned_ind, 2)
##   [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]   
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" 
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]  
## 1 "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45] 
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"
##   [,46]   [,47]   [,48]   [,49]   [,50]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
# Majority-vote predictions from the odd-numbered trees, for an
# increasing number of trees, to track how the training
# misclassification rate evolves with forest size.
n <- nrow(bag.untunned_ind)                 # observations
m <- ceiling(ncol(bag.untunned_ind) / 2)    # number of tree-count steps
predicted_ind <- character(n)               # preallocate instead of growing
misclass.ind <- numeric(m)

for (i in seq_len(m)) {   # vote over i trees at step i
  # Odd-numbered tree columns 1, 3, ..., 2i - 1.  The original
  # `1:i*2-1` relied on precedence ((1:i)*2 - 1); spelled out here.
  tree.cols <- seq(1, 2 * i - 1, by = 2)
  for (j in seq_len(n)) {
    votes <- table(bag.untunned_ind[j, tree.cols])
    # which.max breaks ties by taking the first (alphabetical) level.
    predicted_ind[j] <- names(which.max(votes))
  }
  misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}

# Derive the ntree axis from m instead of hard-coding 50 trees,
# so the plot stays correct if the forest size changes.
bag.untunned.df <- data.frame(misclass.ind,
                              ntree = seq(1, by = 2, length.out = m))

ggplot(bag.untunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
  ggtitle("Number of trees vs Misclassification rate in training dataset - untunned bagged model")

#======================== Let's actually tune the hyperparameters



#Bagged tree tuning
# List the tunable hyperparameters exposed by classif.randomForest.
getParamSet(bag.rf)
##                      Type  len   Def   Constr Req Tunable Trafo
## ntree             integer    -   500 1 to Inf   -    TRUE     -
## mtry              integer    -     - 1 to Inf   -    TRUE     -
## replace           logical    -  TRUE        -   -    TRUE     -
## classwt     numericvector <NA>     - 0 to Inf   -    TRUE     -
## cutoff      numericvector <NA>     -   0 to 1   -    TRUE     -
## strata            untyped    -     -        -   -   FALSE     -
## sampsize    integervector <NA>     - 1 to Inf   -    TRUE     -
## nodesize          integer    -     1 1 to Inf   -    TRUE     -
## maxnodes          integer    -     - 1 to Inf   -    TRUE     -
## importance        logical    - FALSE        -   -    TRUE     -
## localImp          logical    - FALSE        -   -    TRUE     -
## proximity         logical    - FALSE        -   -   FALSE     -
## oob.prox          logical    -     -        -   Y   FALSE     -
## norm.votes        logical    -  TRUE        -   -   FALSE     -
## do.trace          logical    - FALSE        -   -   FALSE     -
## keep.forest       logical    -  TRUE        -   -   FALSE     -
## keep.inbag        logical    - FALSE        -   -   FALSE     -
#Specifying the search space for hyperparameters
# Only nodesize and ntree are searched; mtry stays fixed at 43 (bagging).
bag.rf_params <- makeParamSet(makeIntegerParam("nodesize", 
                                           lower = 10, upper = 50),
                          makeIntegerParam("ntree", lower = 3, upper = 100))



#Set validation strategy
rdesc <- makeResampleDesc("CV", iters = 3L)



#Set optimization technique
# Random search with only 5 draws — cheap, but a coarse exploration
# of the 2-D space.
bag.rf_ctrl <- makeTuneControlRandom(maxit = 5L)



#Start Hypertuning the parameters
bag.rf_tune <- tuneParams(learner = bag.rf, task = traintask, 
                          resampling = rdesc,
                   measures = list(acc), par.set = bag.rf_params,
                   control = bag.rf_ctrl, show.info = TRUE)
## [Tune] Started tuning learner classif.randomForest for parameter set:
##             Type len Def   Constr Req Tunable Trafo
## nodesize integer   -   - 10 to 50   -    TRUE     -
## ntree    integer   -   - 3 to 100   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: nodesize=21; ntree=24
## [Tune-y] 1: acc.test.mean=0.857; time: 0.5 min
## [Tune-x] 2: nodesize=17; ntree=51
## [Tune-y] 2: acc.test.mean=0.856; time: 1.0 min
## [Tune-x] 3: nodesize=17; ntree=66
## [Tune-y] 3: acc.test.mean=0.858; time: 1.3 min
## [Tune-x] 4: nodesize=43; ntree=68
## [Tune-y] 4: acc.test.mean=0.863; time: 1.2 min
## [Tune-x] 5: nodesize=29; ntree=52
## [Tune-y] 5: acc.test.mean=0.861; time: 0.9 min
## [Tune] Result: nodesize=43; ntree=68 : acc.test.mean=0.863
#Optimal hypertuned parameters
bag.rf_tune$x
## $nodesize
## [1] 43
## 
## $ntree
## [1] 68
#Accuracy rate from Cross Validation
bag.rf_tune$y
## acc.test.mean 
##     0.8625703
#Use hyperparameters for modeling
# Merge the winning nodesize/ntree into the learner (existing
# par.vals such as mtry and cutoff are kept).
bag.rf_tree <- setHyperPars(bag.rf, par.vals = bag.rf_tune$x)



#Train a model
bag.rforest <- mlr::train(bag.rf_tree, traintask)
getLearnerModel(bag.rforest)
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L) 
##                Type of random forest: classification
##                      Number of trees: 68
## No. of variables tried at each split: 43
## 
##         OOB estimate of  error rate: 13.66%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 22960 1760  0.07119741
## >50K   2666 5016  0.34704504
#***Make plots for random forest model

#========================================================================



#Let's see how the test classification error changes as we increase the number of trees for tunned model  ( #number of trees VS test classification error)
# Per-tree predicted labels (rows = observations, columns = trees)
# for the tuned 68-tree model, on the training data.
bag.tunned_ind <- predict(bag.rforest$learner.model, newtrain2, 
                    predict.all = T)$individual
head(bag.tunned_ind, 2)
##   [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]   
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" ">50K" 
##   [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
##   [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K"
##   [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" "<=50K"
##   [,46]   [,47]   [,48]   [,49]   [,50]   [,51]   [,52]   [,53]   [,54]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K" 
## 2 "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" "<=50K"
##   [,55]   [,56]   [,57]   [,58]   [,59]   [,60]   [,61]   [,62]   [,63]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" ">50K"  ">50K"  "<=50K" "<=50K"
##   [,64]   [,65]   [,66]   [,67]   [,68]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K"  "<=50K"
# Majority-vote predictions from the odd-numbered trees of the tuned
# model, for an increasing number of trees, to track the training
# misclassification rate as the forest grows.
n <- nrow(bag.tunned_ind)                 # observations
m <- ceiling(ncol(bag.tunned_ind) / 2)    # number of tree-count steps
predicted_ind <- character(n)             # preallocate instead of growing
misclass.ind <- numeric(m)

for (i in seq_len(m)) {   # vote over i trees at step i
  # Odd-numbered tree columns 1, 3, ..., 2i - 1.  The original
  # `1:i*2-1` relied on precedence ((1:i)*2 - 1); spelled out here.
  tree.cols <- seq(1, 2 * i - 1, by = 2)
  for (j in seq_len(n)) {
    votes <- table(bag.tunned_ind[j, tree.cols])
    # which.max breaks ties by taking the first (alphabetical) level.
    predicted_ind[j] <- names(which.max(votes))
  }
  misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}

# Derive the ntree axis from m instead of hard-coding 68 trees.
bag.tunned.df <- data.frame(misclass.ind,
                            ntree = seq(1, by = 2, length.out = m))

ggplot(bag.tunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
  ggtitle("Number of trees vs Misclassification rate in training dataset - tunned bagged model")

#Variable importance statistics
# Dot-chart of the two importance measures for the tuned forest.
varImpPlot(bag.rforest$learner.model)

# Full importance table: per-class mean decrease in accuracy,
# overall MeanDecreaseAccuracy, and MeanDecreaseGini.
importance(bag.rforest$learner.model)
##                               <=50K        >50K MeanDecreaseAccuracy
## age                   -13.022404619  67.4753667          40.16504758
## education.num          14.906765795  60.3634774          52.75366374
## capital.gain           95.990371633 100.2240681         120.62875611
## capital.loss           29.881885811  68.2701358          50.74005081
## hours.per.week         -5.344224745  37.2771623          27.71506590
## Local.gov               7.904425737  -0.9551792           5.76331191
## No.gain                 0.000000000   0.0000000           0.00000000
## Private                 8.298374351  -1.9894435           7.92407861
## Self.emp.inc            4.409166626   1.2369293           4.21427059
## Self.emp.not.inc       17.765098169   0.7731384          17.40876070
## State.gov               7.599717792  -3.5201521           4.82212848
## Married.AF.spouse      -1.899616305   4.0462318           1.74721957
## Married.civ.spouse     24.951055410  74.4942455          85.35353206
## Married.spouse.absent  -3.587531903   0.2389463          -3.40144767
## Never.married           3.141743832  -1.8860144           3.03550001
## Separated              -0.311222932   0.5310407          -0.06140906
## Widowed                 1.024164894  -0.1165945           0.89627638
## Armed.Forces            0.000000000   0.0000000           0.00000000
## Craft.repair            2.512301411   3.4041561           7.09837298
## Exec.managerial         2.503491752  14.2502558          17.78909167
## Farming.fishing         7.289129670   7.6609658          12.85222437
## Handlers.cleaners       1.601077058  10.3739954          10.56789812
## Machine.op.inspct       2.463727194   8.0450045          10.09705795
## Other.service         -13.238964972  18.1797187          16.11203961
## Priv.house.serv        -1.007435047   0.0000000          -1.00743505
## Prof.specialty          9.325886134   6.3193696          14.03067808
## Protective.serv        -1.154071066   7.6527703           6.40478480
## Sales                  -3.099394496   4.4816547           2.81916174
## Tech.support            3.615391280  15.3277499          15.12281346
## Transport.moving        3.540878339   4.1488966           6.77990757
## Not.in.family           4.560014618   2.4358485           7.83037710
## Other.relative         -2.733742632   9.1867657           7.43418019
## Own.child               4.746791011   3.7476768           8.50872528
## Unmarried              -0.964675986  -0.7981551          -1.19627697
## Wife                    3.439577110  15.5303412           5.61330163
## Asian.Pac.Islander      4.879102564  -0.2367667           4.42374634
## Black                   1.388736810   5.7893184           6.44067514
## Other                   0.007983821   1.9455020           1.15310924
## White                  -1.021156086   4.9407330           3.27693944
## Male                    9.362764847   1.2051294          11.47695072
## other_countries         3.160924928   0.1052195           2.58749260
## Philippines            -2.351414987   2.4893630           0.31596858
## United.States           2.384444995   1.2419303           2.90132520
##                       MeanDecreaseGini
## age                       597.09980328
## education.num            1313.20657000
## capital.gain             1123.93910840
## capital.loss              394.03545718
## hours.per.week            403.69902059
## Local.gov                  26.62121430
## No.gain                     0.00000000
## Private                    34.85345811
## Self.emp.inc               31.42607290
## Self.emp.not.inc           68.42861618
## State.gov                  25.36534708
## Married.AF.spouse           5.55656634
## Married.civ.spouse       2290.97129831
## Married.spouse.absent       3.77466788
## Never.married               8.04425050
## Separated                   4.45930282
## Widowed                     7.00353256
## Armed.Forces                0.00000000
## Craft.repair               27.35518124
## Exec.managerial            95.96340692
## Farming.fishing            36.11000707
## Handlers.cleaners          23.47151631
## Machine.op.inspct          26.91537528
## Other.service              41.44214064
## Priv.house.serv             0.08155885
## Prof.specialty             48.95095629
## Protective.serv            22.82066641
## Sales                      35.54876962
## Tech.support               47.94626370
## Transport.moving           31.43454712
## Not.in.family               9.78332366
## Other.relative              8.17089443
## Own.child                   7.44808214
## Unmarried                   3.26653970
## Wife                       42.86895624
## Asian.Pac.Islander         18.52902669
## Black                      20.19454254
## Other                       6.77053011
## White                      24.72990804
## Male                       37.61097060
## other_countries            19.37735451
## Philippines                 9.10272631
## United.States              16.12389600

\(\\\)

\(\\\)

set.seed(100)
# ** Plot bagged tree



# ** Make predictions on training dataset
bag.rfclass1 <- predict(bag.rforest, traintask)



#Confusion matrix on training dataset
# caret::confusionMatrix(predicted, reference); mlr stores labels in
# $data$response and the truth in $data$truth.
confusionMatrix(bag.rfclass1$data$response, bag.rfclass1$data$truth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 23414  2203
##      >50K   1306  5479
##                                                
##                Accuracy : 0.8917               
##                  95% CI : (0.8883, 0.8951)     
##     No Information Rate : 0.7629               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6881               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9472               
##             Specificity : 0.7132               
##          Pos Pred Value : 0.9140               
##          Neg Pred Value : 0.8075               
##              Prevalence : 0.7629               
##          Detection Rate : 0.7226               
##    Detection Prevalence : 0.7906               
##       Balanced Accuracy : 0.8302               
##                                                
##        'Positive' Class : <=50K                
## 
#Make random forest plots on training dataset
# NOTE(review): both arguments are factors, so plot() dispatches to a
# spine/mosaic-style plot; abline(0, 1) may not be meaningful here —
# confirm intent.
plot(bag.rfclass1$data$response, newtrain2$income)
abline(0, 1)

#Training accuracy rate
1 - mean(bag.rfclass1$data$response != newtrain2$income)
## [1] 0.8917042
#Make predictions on test dataset
bag.rfclass2 <- predict(bag.rforest, testtask)



#Confusion matrix on test dataset
confusionMatrix(bag.rfclass2$data$response, bag.rfclass2$data$truth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11504  1340
##      >50K    931  2421
##                                                
##                Accuracy : 0.8598               
##                  95% CI : (0.8543, 0.8651)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5913               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9251               
##             Specificity : 0.6437               
##          Pos Pred Value : 0.8957               
##          Neg Pred Value : 0.7223               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7103               
##    Detection Prevalence : 0.7930               
##       Balanced Accuracy : 0.7844               
##                                                
##        'Positive' Class : <=50K                
## 
#Make random forest plots on test dataset
# NOTE(review): factor-vs-factor plot; abline(0, 1) may not be
# meaningful — confirm intent.
plot(bag.rfclass2$data$response, newtest2$income)
abline(0, 1)

#Test accuracy rate
# Training accuracy (0.8917) vs test accuracy (0.8598): mild overfit.
1 - mean(bag.rfclass2$data$response != newtest2$income)
## [1] 0.8597802

\(\\\)

\(\\\)

ROC and AUC

set.seed(100)
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Untunned bagged tree model
#Getting predicted >50K of income probabilities 
# NOTE(review): bag.rf here still carries cutoff = c(0.55, 0.45) from
# earlier — confirm this is the intended "untunned" baseline.
# Column 2 of the probability matrix corresponds to the second factor
# level (">50K", per the levels shown in the task summary).
untunned.bag.rf <- mlr::train(bag.rf, traintask)
untunned.bag.rf_prob <- predict(untunned.bag.rf$learner.model,
                            newdata = newtest2, type = "prob")[, 2]
untunned.bag.rf_prediction <- prediction(untunned.bag.rf_prob,
                                         newtest2$income)
untunned.bag.rf_performance <- ROCR::performance(untunned.bag.rf_prediction,
                                                 measure = "tpr", 
                                                 x.measure = "fpr")



#Plot ROC curve 
plot(untunned.bag.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
untunned.bag.rf.auc <- ROCR::performance(untunned.bag.rf_prediction,
                                     measure = "auc")@y.values[[1]]
untunned.bag.rf.auc
## [1] 0.8817957
#=====================================================================



#Tunned bagged tree model
#Getting predicted >50K of income probabilities 
tunned.bag.rf_prob <- predict(bag.rforest$learner.model, newdata = newtest2,
                     type = "prob")[, 2]
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
                                               measure = "tpr",
                                               x.measure = "fpr")



#Plot ROC curve 
plot(tunned.bag.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
tunned.bag.rf.auc <- ROCR::performance(tunned.bag.rf_prediction,
                                   measure = "auc")@y.values[[1]]
tunned.bag.rf.auc
## [1] 0.8942506

\(\\\)

\(\\\)

Compare ROC and AUC of the tuned and untuned bagged tree models

set.seed(100)
# Overlay the ROC curves of the tuned (blue) and untuned (red) bagged
# models on the test set.
plot(tunned.bag.rf_performance, main = "ROC curve", col = "blue")
plot(untunned.bag.rf_performance, add = TRUE, col = "red")
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Tunned", "Untunned"), col = c("blue", "red"), lwd=3, cex=.8, horiz = TRUE)

# Compare AUC.
# NOTE: order() is documented for atomic vectors, not data frames;
# unlist() first so the columns are sorted by a plain numeric vector.
auc <- data.frame(tunned.bag.rf.auc, untunned.bag.rf.auc)
auc[, order(unlist(auc))]
##   untunned.bag.rf.auc tunned.bag.rf.auc
## 1           0.8817957         0.8942506
# Pick the model with the largest AUC --> tuned bagged tree
final.auc2 <- bag.rforest$learner.model

\(\\\)

\(\\\)

Pick the best threshold for each model which leads best accuracy

set.seed(100)
# Grid of candidate classification thresholds.
thresholds <- seq(from = 0.001, to = 0.999, by = 0.001)
accuracy <- numeric(length(thresholds))   # preallocate instead of growing



#==================================================================



# Using the train dataset to find the threshold that maximizes accuracy.
untunned.bag.rf_prob.train <- predict(untunned.bag.rf$learner.model,
                            newdata = newtrain2, type = "prob")[, 2]



# Untuned bagged model (comment fixed: this block was previously
# mislabelled as the gini-tuned model).
for (i in seq_along(thresholds)) {
  accuracy[i] <- mean((untunned.bag.rf_prob.train > thresholds[i]) ==
                        (newtrain2$income == ">50K"))
}



# Threshold giving maximum training accuracy.  Index into `thresholds`
# rather than multiplying by 0.001 so this stays correct if the grid
# ever changes.
thres1 <- thresholds[which.max(accuracy)]
thres1
## [1] 0.46
# Plot of accuracy vs thresholds.
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get confusion matrix of testset data using the optimal threshold
# thres1 was selected on the TRAINING data; here it is applied to the
# test-set probabilities.
confusionMatrix(untunned.bag.rf_prob > thres1, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11235  1392
##      TRUE   1200  2369
##                                                
##                Accuracy : 0.84                 
##                  95% CI : (0.8342, 0.8456)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5431               
##  Mcnemar's Test P-Value : 0.0001757            
##                                                
##             Sensitivity : 0.9035               
##             Specificity : 0.6299               
##          Pos Pred Value : 0.8898               
##          Neg Pred Value : 0.6638               
##              Prevalence : 0.7678               
##          Detection Rate : 0.6937               
##    Detection Prevalence : 0.7796               
##       Balanced Accuracy : 0.7667               
##                                                
##        'Positive' Class : FALSE                
## 
#Test accuracy rate by using optimal threshold
untunned.bagged.accuracy <- mean((untunned.bag.rf_prob > thres1) == (newtest2$income == ">50K"))



#compare the test accuracy by using default threshold (0.5)
thres.untunned.bag.half <- mean((untunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K")) 



#==================================================================



# Using the train dataset to find the optimal threshold for the TUNED
# bagged model.
tunned.bag.rf_prob.train <- predict(bag.rforest$learner.model,
                            newdata = newtrain2, type = "prob")[, 2]



# Tuned bagged model (comment fixed: this block was previously
# mislabelled as the gini-tuned model).
accuracy <- numeric(length(thresholds))   # preallocate / reset
for (i in seq_along(thresholds)) {
  accuracy[i] <- mean((tunned.bag.rf_prob.train > thresholds[i]) ==
                        (newtrain2$income == ">50K"))
}



# Threshold giving maximum training accuracy; index into `thresholds`
# rather than multiplying by 0.001 so this stays correct if the grid
# ever changes.
thres2 <- thresholds[which.max(accuracy)]
thres2
## [1] 0.442
# Plot of accuracy vs thresholds.
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get confusion matrix of testset data using the optimal threshold
confusionMatrix(tunned.bag.rf_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11504  1340
##      TRUE    931  2421
##                                                
##                Accuracy : 0.8598               
##                  95% CI : (0.8543, 0.8651)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5913               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9251               
##             Specificity : 0.6437               
##          Pos Pred Value : 0.8957               
##          Neg Pred Value : 0.7223               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7103               
##    Detection Prevalence : 0.7930               
##       Balanced Accuracy : 0.7844               
##                                                
##        'Positive' Class : FALSE                
## 
# Test-set accuracy when classifying with the tuned threshold `thres2`.
tunned.bagged.accuracy <- mean(
  (tunned.bag.rf_prob > thres2) == (newtest2$income == ">50K")
)

# Test-set accuracy with the default 0.5 threshold, for comparison.
thres.tunned.bag.half <- mean(
  (tunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K")
)

Compare Accuracy & AUC

set.seed(100)

#Compare AUC of the tuned vs untuned bagged models.
#NOTE: order() on a one-row data.frame ranks its columns, so the columns
#print from smallest to largest value.
auc <- data.frame(tunned.bag.rf.auc, untunned.bag.rf.auc)
auc[, order(auc)]
##   untunned.bag.rf.auc tunned.bag.rf.auc
## 1           0.8817957         0.8942506
#Pick the model with the largest AUC --> tunned bagged tree
final.auc2 <- bag.rforest$learner.model


#Compare Accuracy - optimal threshold
accuracy.bag.df <- data.frame(tunned.bagged.accuracy,
                              untunned.bagged.accuracy)
accuracy.bag.df[, order(accuracy.bag.df)]
##   untunned.bagged.accuracy tunned.bagged.accuracy
## 1                0.8399605              0.8597802
#Pick the model with the highest Accuracy --> tunned.bagged.accuracy
final.thres2 <- bag.rforest$learner.model


#Compare Accuracy - 0.5 threshold
accuracy.bag.df.half <- data.frame(thres.untunned.bag.half,
                              thres.tunned.bag.half)
accuracy.bag.df.half[, order(accuracy.bag.df.half)]
##   thres.untunned.bag.half thres.tunned.bag.half
## 1               0.8449617             0.8620647
#Pick the model with the highest Accuracy --> thres.tunned.bag.half
final.thres2.half <- bag.rforest$learner.model

Random Forest

Train and tune a Random Forest classifier

set.seed(100)

#=============================================================

# Wrap the processed train/test data as mlr classification tasks.
# ">50K" is treated as the positive class by all downstream measures.
traintask <- makeClassifTask(
  data = newtrain2,
  target = "income",
  positive = ">50K"
)
testtask <- makeClassifTask(
  data = newtest2,
  target = "income",
  positive = ">50K"
)



#Brief summary of the training task (observation/feature counts, class
#balance, positive class).
traintask
## Supervised task: newtrain2
## Type: classif
## Target: income
## Observations: 32402
## Features:
## numerics  factors  ordered 
##       43        0        0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Classes: 2
## <=50K  >50K 
## 24720  7682 
## Positive class: >50K
#For a deeper view: column-level structure of the data backing the task
#(all dummy-encoded predictors plus the factor target).
str(getTaskData(traintask))
## 'data.frame':    32402 obs. of  44 variables:
##  $ age                  : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ education.num        : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ capital.gain         : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ Local.gov            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.gain              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : int  0 0 1 1 1 1 1 0 1 1 ...
##  $ Self.emp.inc         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self.emp.not.inc     : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ State.gov            : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Married.AF.spouse    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married.civ.spouse   : int  0 1 0 1 1 1 0 1 0 1 ...
##  $ Married.spouse.absent: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Never.married        : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ Separated            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed.Forces         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft.repair         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exec.managerial      : int  0 1 0 0 0 1 0 1 0 1 ...
##  $ Farming.fishing      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handlers.cleaners    : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ Machine.op.inspct    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other.service        : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Priv.house.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof.specialty       : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ Protective.serv      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tech.support         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport.moving     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not.in.family        : int  1 0 1 0 0 0 1 0 1 0 ...
##  $ Other.relative       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own.child            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unmarried            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wife                 : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ Asian.Pac.Islander   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : int  0 0 0 1 1 0 1 0 0 0 ...
##  $ Other                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : int  1 1 1 0 0 1 0 1 1 1 ...
##  $ Male                 : int  1 1 1 1 0 0 0 1 0 1 ...
##  $ other_countries      : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ Philippines          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ United.States        : int  1 1 1 1 0 1 0 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
#=============================================================



# Baseline random forest learner: 50 trees, variable importance recorded,
# hard class predictions ("response").
rf <- makeLearner(
  "classif.randomForest",
  predict.type = "response",
  par.vals = list(ntree = 50L, importance = TRUE)
)

# Validation strategy: 3-fold cross-validation.
rdesc <- makeResampleDesc("CV", iters = 3L)

# Cross-validated performance of the baseline forest.
r2 <- resample(
  learner = rf,
  task = traintask,
  resampling = rdesc,
  measures = list(tpr, fpr, fnr, tnr, acc),
  show.info = TRUE
)
## [Resample] cross-validation iter 1: tpr.test.mean=0.632,fpr.test.mean=0.0573,fnr.test.mean=0.368,tnr.test.mean=0.943,acc.test.mean=0.868
## [Resample] cross-validation iter 2: tpr.test.mean=0.635,fpr.test.mean=0.0603,fnr.test.mean=0.365,tnr.test.mean=0.94,acc.test.mean=0.869
## [Resample] cross-validation iter 3: tpr.test.mean=0.602,fpr.test.mean=0.0581,fnr.test.mean=0.398,tnr.test.mean=0.942,acc.test.mean=0.861
## [Resample] Aggr. Result: tpr.test.mean=0.623,fpr.test.mean=0.0586,fnr.test.mean=0.377,tnr.test.mean=0.941,acc.test.mean=0.866
#Show true positive rate, false positive rate, false negative rate, true
#negative rate, and accuracy rate from the baseline random forest model
r2
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.623,fpr.test.mean=0.0586,fnr.test.mean=0.377,tnr.test.mean=0.941,acc.test.mean=0.866
## Runtime: 49.0401
#Aggr. Result: tpr.test.mean=0.623,fpr.test.mean=0.0598,fnr.test.mean=0.377,tnr.test.mean=0.94,acc.test.mean=0.865



#Internally, random forest uses a cutoff of 0.5  --> 
#if a particular unseen observation has a probability higher than 0.5, it will be classified as >50K.
#In random forest, we have the option to customize the internal cutoff. As the false negative rate is very high now, we'll increase the cutoff for negative classes (<=50K) and accordingly reduce it for positive classes (>50K). Then, train the model again.



#Evaluating by using new cutoff
#NOTE(review): this mutates the shared `rf` learner in place, so every
#later mlr::train(rf, ...) call (e.g. the "untuned" forests below) also
#uses cutoff = c(0.53, 0.47) — confirm this is intended.
rf$par.vals <- list(ntree = 50L, importance = TRUE, cutoff = c(0.53, 0.47))
r3 <- resample(learner = rf, task = traintask, resampling = rdesc, 
              measures = list(tpr,fpr,fnr,tnr,acc), show.info = TRUE)
## [Resample] cross-validation iter 1: tpr.test.mean=0.624,fpr.test.mean=0.0647,fnr.test.mean=0.376,tnr.test.mean=0.935,acc.test.mean=0.862
## [Resample] cross-validation iter 2: tpr.test.mean=0.651,fpr.test.mean=0.0644,fnr.test.mean=0.349,tnr.test.mean=0.936,acc.test.mean=0.868
## [Resample] cross-validation iter 3: tpr.test.mean=0.66,fpr.test.mean=0.068,fnr.test.mean=0.34,tnr.test.mean=0.932,acc.test.mean=0.867
## [Resample] Aggr. Result: tpr.test.mean=0.645,fpr.test.mean=0.0657,fnr.test.mean=0.355,tnr.test.mean=0.934,acc.test.mean=0.866
#Show true positive rate, false positive rate, false negative rate, true
#negative rate, and accuracy rate with the adjusted cutoff
r3
## Resample Result
## Task: newtrain2
## Learner: classif.randomForest
## Aggr perf: tpr.test.mean=0.645,fpr.test.mean=0.0657,fnr.test.mean=0.355,tnr.test.mean=0.934,acc.test.mean=0.866
## Runtime: 47.6331
#Aggr. Result: tpr.test.mean=0.651,fpr.test.mean=0.0683,fnr.test.mean=0.349,tnr.test.mean=0.932,acc.test.mean=0.865    ---> we can see that false negative rate is decreased even though the accuracy rate stays the same. I have tried cutoff = c(0.6, 0.4), cutoff = c(0.7, 0.3) but they all gave lower accuracy late.



#========================================================================



#Random Forest tuning



#Train an old untuned model
#NOTE(review): `rf` was mutated above to carry cutoff = c(0.53, 0.47),
#so this "untuned" forest inherits that cutoff — confirm intended.
untunnedforest <- mlr::train(rf, traintask)



#Let's see how the classification error changes as we increase the number of trees for the untuned model   ( #number of trees VS classification error)
#NOTE: despite "test" in the original wording, the predictions below are
#on the TRAINING data (newtrain2).

#predict.all = T returns each individual tree's vote: an n x ntree
#character matrix of "<=50K"/">50K" labels.
rf.untunned_ind <- predict(untunnedforest$learner.model, newtrain2, 
                    predict.all = T)$individual

head(rf.untunned_ind,2)
##   [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]   
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" ">50K"  "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" "<=50K"
##   [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" ">50K"  "<=50K" ">50K"  "<=50K" "<=50K" ">50K"  ">50K" 
##   [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" ">50K"  "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" ">50K" 
##   [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" ">50K"  ">50K" 
##   [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" ">50K"  "<=50K" ">50K"  "<=50K" "<=50K" ">50K"  ">50K" 
##   [,46]   [,47]   [,48]   [,49]   [,50]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" ">50K"  ">50K"  ">50K"
n <- dim(rf.untunned_ind)[1]
m <- dim(rf.untunned_ind)[2] / 2
# Preallocate result vectors instead of growing them inside the loops.
predicted_ind <- character(n)
misclass.ind <- numeric(m)

for(i in seq_len(m)){   # majority vote over the first (2*i - 1) trees
  for(j in 1:n){
    # BUG FIX: the original `1:i*2-1` parses as `(1:i)*2 - 1`, i.e. every
    # OTHER tree (i trees total), while the ntree axis below assumes the
    # first 2*i - 1 trees.  Parenthesize to take the intended prefix.
    predicted_ind[j] <- names(which.max(table(rf.untunned_ind[j, 1:(2 * i - 1)])))
  }
  # Training-set misclassification rate of the (2*i - 1)-tree ensemble.
  misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}

# Odd tree counts 1, 3, ..., 49 to match the 50-tree forest.
rf.untunned.df <- data.frame(misclass.ind, ntree = seq(1, 49, 2))

ggplot(rf.untunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
  ggtitle("Number of trees vs Misclassification rate in training dataset - untunned random forest model")

#======================== Let's actually tune the hyperparameters

#List the tunable hyperparameters of the randomForest learner.
getParamSet(rf)
##                      Type  len   Def   Constr Req Tunable Trafo
## ntree             integer    -   500 1 to Inf   -    TRUE     -
## mtry              integer    -     - 1 to Inf   -    TRUE     -
## replace           logical    -  TRUE        -   -    TRUE     -
## classwt     numericvector <NA>     - 0 to Inf   -    TRUE     -
## cutoff      numericvector <NA>     -   0 to 1   -    TRUE     -
## strata            untyped    -     -        -   -   FALSE     -
## sampsize    integervector <NA>     - 1 to Inf   -    TRUE     -
## nodesize          integer    -     1 1 to Inf   -    TRUE     -
## maxnodes          integer    -     - 1 to Inf   -    TRUE     -
## importance        logical    - FALSE        -   -    TRUE     -
## localImp          logical    - FALSE        -   -    TRUE     -
## proximity         logical    - FALSE        -   -   FALSE     -
## oob.prox          logical    -     -        -   Y   FALSE     -
## norm.votes        logical    -  TRUE        -   -   FALSE     -
## do.trace          logical    - FALSE        -   -   FALSE     -
## keep.forest       logical    -  TRUE        -   -   FALSE     -
## keep.inbag        logical    - FALSE        -   -   FALSE     -
#Specifying the search space for hyperparameters:
#  mtry     - variables tried at each split
#  nodesize - minimum terminal-node size
#  ntree    - number of trees
rf_params <- makeParamSet(makeIntegerParam("mtry", lower = 2, upper = 10),
                       makeIntegerParam("nodesize", lower = 10, upper = 50),
                       makeIntegerParam("ntree", lower = 3, upper = 100)
                       )



#Set validation strategy (3-fold CV; re-created here, same as above)
rdesc <- makeResampleDesc("CV", iters = 3L)



#Set optimization technique: random search with 5 evaluations
rf_ctrl <- makeTuneControlRandom(maxit = 5L)



#Start hypertuning the parameters, maximizing CV accuracy
rf_tune <- tuneParams(learner = rf, task = traintask, resampling = rdesc,
                   measures = list(acc), par.set = rf_params,
                   control = rf_ctrl, show.info = TRUE)
## [Tune] Started tuning learner classif.randomForest for parameter set:
##             Type len Def   Constr Req Tunable Trafo
## mtry     integer   -   -  2 to 10   -    TRUE     -
## nodesize integer   -   - 10 to 50   -    TRUE     -
## ntree    integer   -   - 3 to 100   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: -0
## [Tune-x] 1: mtry=8; nodesize=14; ntree=79
## [Tune-y] 1: acc.test.mean=0.866; time: 1.2 min
## [Tune-x] 2: mtry=6; nodesize=23; ntree=31
## [Tune-y] 2: acc.test.mean=0.865; time: 0.5 min
## [Tune-x] 3: mtry=3; nodesize=12; ntree=18
## [Tune-y] 3: acc.test.mean=0.858; time: 0.2 min
## [Tune-x] 4: mtry=3; nodesize=17; ntree=59
## [Tune-y] 4: acc.test.mean=0.858; time: 0.8 min
## [Tune-x] 5: mtry=5; nodesize=12; ntree=4
## [Tune-y] 5: acc.test.mean=0.852; time: 0.1 min
## [Tune] Result: mtry=8; nodesize=14; ntree=79 : acc.test.mean=0.866
#Optimal hypertuned parameters found by the random search
rf_tune$x
## $mtry
## [1] 8
## 
## $nodesize
## [1] 14
## 
## $ntree
## [1] 79
#Accuracy rate from Cross Validation at the chosen parameters
rf_tune$y
## acc.test.mean 
##     0.8660267
#Apply the tuned hyperparameters to the learner
rf_tree <- setHyperPars(rf, par.vals = rf_tune$x)



#Train the tuned random forest on the full training task
rforest <- mlr::train(rf_tree, traintask)
getLearnerModel(rforest)
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L) 
##                Type of random forest: classification
##                      Number of trees: 79
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 13.47%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23063 1657  0.06703074
## >50K   2709 4973  0.35264254
#========================================================================



#Let's see how the classification error changes as we increase the number of trees for the tuned model  ( #number of trees VS classification error)
#NOTE: as above, these predictions are on the TRAINING data (newtrain2).



#Per-tree votes of the tuned forest: an n x 79 character matrix.
rf.tunned_ind <- predict(rforest$learner.model, newtrain2, 
                    predict.all = T)$individual
head(rf.tunned_ind,2)
##   [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]   
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" ">50K"  ">50K"  ">50K"  ">50K"  "<=50K" "<=50K" ">50K" 
##   [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 ">50K"  "<=50K" "<=50K" ">50K"  "<=50K" ">50K"  "<=50K" ">50K"  "<=50K"
##   [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" ">50K"  ">50K"  "<=50K"
##   [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K"  "<=50K" ">50K"  ">50K"  "<=50K" ">50K"  ">50K"  ">50K" 
##   [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" "<=50K" "<=50K" "<=50K"
##   [,46]   [,47]   [,48]   [,49]   [,50]   [,51]   [,52]   [,53]   [,54]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" ">50K"  ">50K"  "<=50K" "<=50K" ">50K"  "<=50K" "<=50K"
##   [,55]   [,56]   [,57]   [,58]   [,59]   [,60]   [,61]   [,62]   [,63]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" "<=50K" ">50K"  "<=50K" ">50K"  "<=50K" "<=50K"
##   [,64]   [,65]   [,66]   [,67]   [,68]   [,69]   [,70]   [,71]   [,72]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" "<=50K" "<=50K" ">50K"  ">50K"  "<=50K" "<=50K" "<=50K" "<=50K"
##   [,73]   [,74]   [,75]   [,76]   [,77]   [,78]   [,79]  
## 1 "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K" "<=50K"
## 2 "<=50K" ">50K"  "<=50K" "<=50K" ">50K"  ">50K"  "<=50K"
n <- dim(rf.tunned_ind)[1]
m <- ceiling(dim(rf.tunned_ind)[2] / 2)
# Preallocate result vectors instead of growing them inside the loops.
predicted_ind <- character(n)
misclass.ind <- numeric(m)

for(i in seq_len(m)){   # majority vote over the first (2*i - 1) trees
  for(j in 1:n){
    # BUG FIX: the original `1:i*2-1` parses as `(1:i)*2 - 1`, i.e. every
    # OTHER tree (i trees total), while the ntree axis below assumes the
    # first 2*i - 1 trees.  Parenthesize to take the intended prefix.
    predicted_ind[j] <- names(which.max(table(rf.tunned_ind[j, 1:(2 * i - 1)])))
  }
  # Training-set misclassification rate of the (2*i - 1)-tree ensemble.
  misclass.ind[i] <- mean(predicted_ind != newtrain2$income)
}

# Odd tree counts 1, 3, ..., 2*m - 1 (79 for this fitted forest); derived
# from m rather than the hard-coded `seq(1, 80, 2)`.
rf.tunned.df <- data.frame(misclass.ind, ntree = seq(1, 2 * m - 1, 2))

# BUG FIX: the original plotted rf.untunned.df here, so the "tunned"
# figure actually showed the untuned model's error curve.
ggplot(rf.tunned.df, aes(x = ntree, y = misclass.ind)) + geom_line() +
  ggtitle("Number of trees vs Misclassification rate in training dataset - tunned random forest model")

#========================================================================



#***Make plots for random forest model



#Variable importance statistics of the tuned forest (importance = TRUE
#was set on the learner, so permutation importances are available).
varImpPlot(rforest$learner.model)

#Numeric importance table: per-class mean decrease in accuracy plus
#mean decrease in Gini.
importance(rforest$learner.model)
##                            <=50K        >50K MeanDecreaseAccuracy
## age                   -4.5086634 34.16525780           36.4804145
## education.num         19.7742328 35.71472197           46.6996346
## capital.gain          52.7119955 77.24609078           73.3586739
## capital.loss          29.9711481 41.06776776           43.1336331
## hours.per.week        -0.4312341 29.02454045           25.6659384
## Local.gov              5.1492927  0.56298571            5.8821188
## No.gain               -0.8184111  1.00638984           -0.3717060
## Private               11.1016368 -2.23975243           10.5785695
## Self.emp.inc           1.1277732  5.82510793            7.7132744
## Self.emp.not.inc      15.5469946 -2.74505347           14.3523579
## State.gov             10.8733163 -4.28415056            6.9979343
## Married.AF.spouse     -3.7189943  4.02124089           -2.0878889
## Married.civ.spouse     6.2045729 40.26237708           29.5940091
## Married.spouse.absent -2.8969481  1.41648079           -1.9070204
## Never.married         -5.3562336  7.60306255            8.4697272
## Separated             -1.7153316  2.95840510            2.3026940
## Widowed                0.5964076  2.59756458            1.8897186
## Armed.Forces           0.0000000  0.00000000            0.0000000
## Craft.repair           5.4840128  3.58903776           10.4377926
## Exec.managerial        5.3670247 19.21079683           21.5580767
## Farming.fishing        9.9615691  6.73540097           13.6178460
## Handlers.cleaners      0.3974181  9.37932930            9.3095290
## Machine.op.inspct      3.6133410  6.91776737            9.1384941
## Other.service         -7.4759085 15.10751217           15.8621146
## Priv.house.serv       -0.8300094  2.45121607            1.2156159
## Prof.specialty         7.6140905 11.38433854           14.9140532
## Protective.serv       -4.2827246  8.95857684            5.1430640
## Sales                 -2.3749483  6.05115849            5.6645534
## Tech.support           2.1784476 16.92936100           12.9507474
## Transport.moving       4.7305436  2.42871893            7.5258776
## Not.in.family         -1.1500949  9.19151189           10.7172849
## Other.relative        -2.1896172  8.63515706            8.0793463
## Own.child              0.5788207  7.71050918            7.8753397
## Unmarried             -2.0795340  4.76099939            4.1781013
## Wife                   1.6250141  8.78884381            7.8250190
## Asian.Pac.Islander     2.5150741 -0.07866057            2.9269550
## Black                  3.6337768  2.12945346            5.1723703
## Other                 -0.5773645  1.53909851            0.8206658
## White                  3.9022122  4.50106434            6.7822017
## Male                   5.2159776  6.08248030           13.1814993
## other_countries        4.9064107  0.58701542            5.2145352
## Philippines           -2.5330590  4.79910859            2.6692177
## United.States          2.0788279  3.74229678            4.8721262
##                       MeanDecreaseGini
## age                       739.84239923
## education.num            1069.23964617
## capital.gain             1184.14284781
## capital.loss              373.78120327
## hours.per.week            497.04494609
## Local.gov                  34.45943727
## No.gain                     0.39987285
## Private                    61.12777811
## Self.emp.inc               47.35922677
## Self.emp.not.inc           66.06778851
## State.gov                  31.22933659
## Married.AF.spouse           5.71895825
## Married.civ.spouse       1257.57544070
## Married.spouse.absent       7.16077121
## Never.married             270.22189400
## Separated                  13.06962836
## Widowed                    11.51838286
## Armed.Forces                0.04649277
## Craft.repair               38.07298242
## Exec.managerial           202.26249938
## Farming.fishing            57.51966059
## Handlers.cleaners          28.50154305
## Machine.op.inspct          31.39069988
## Other.service              73.15964271
## Priv.house.serv             1.04174638
## Prof.specialty            125.41528470
## Protective.serv            23.02565963
## Sales                      41.36054495
## Tech.support               50.74963135
## Transport.moving           32.46031537
## Not.in.family             162.93055631
## Other.relative             12.64898998
## Own.child                  75.38402000
## Unmarried                  52.33314608
## Wife                       73.94834919
## Asian.Pac.Islander         21.75418947
## Black                      28.86962567
## Other                       7.23911786
## White                      37.00812346
## Male                      127.51952291
## other_countries            26.08404616
## Philippines                 8.72432373
## United.States              36.12278055
set.seed(100)
# ** Plot the tuned forest (error rate vs number of trees)
plot(rforest$learner.model)

#Optionally inspect a single tree, e.g. tree 10:
#getTree(rforest$learner.model, k = 10, labelVar = TRUE)



# ** Make predictions on training dataset (mlr Prediction object; the
# response and truth columns are used below)
rfclass1 <- predict(rforest, traintask)



#Confusion matrix on training dataset
confusionMatrix(rfclass1$data$response, rfclass1$data$truth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 23561  2189
##      >50K   1159  5493
##                                                
##                Accuracy : 0.8967               
##                  95% CI : (0.8933, 0.9)        
##     No Information Rate : 0.7629               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7005               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9531               
##             Specificity : 0.7150               
##          Pos Pred Value : 0.9150               
##          Neg Pred Value : 0.8258               
##              Prevalence : 0.7629               
##          Detection Rate : 0.7271               
##    Detection Prevalence : 0.7947               
##       Balanced Accuracy : 0.8341               
##                                                
##        'Positive' Class : <=50K                
## 
#Make random forest plots on training dataset (factor-vs-factor plot of
#predicted vs actual income)
plot(rfclass1$data$response, newtrain2$income)
abline(0, 1)

#Training accuracy rate
1 - mean(rfclass1$data$response != newtrain2$income)
## [1] 0.896673
#Make predictions on test dataset
rfclass2 <- predict(rforest, testtask)



#Confusion matrix on test dataset (positive class per caret's default is
#the first factor level, "<=50K")
confusionMatrix(rfclass2$data$response, rfclass2$data$truth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11601  1362
##      >50K    834  2399
##                                                
##                Accuracy : 0.8644               
##                  95% CI : (0.859, 0.8696)      
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6002               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9329               
##             Specificity : 0.6379               
##          Pos Pred Value : 0.8949               
##          Neg Pred Value : 0.7420               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7163               
##    Detection Prevalence : 0.8004               
##       Balanced Accuracy : 0.7854               
##                                                
##        'Positive' Class : <=50K                
## 
#Make random forest plots on test dataset (predicted vs actual income)
plot(rfclass2$data$response, newtest2$income)
abline(0,1)

#Test accuracy rate
1 - mean(rfclass2$data$response != newtest2$income)
## [1] 0.864411

ROC and AUC

set.seed(100)
#ROC Curve: https://stackoverflow.com/questions/30818188/roc-curve-in-r-using-rpart-package
#Untunned random forest model
#Getting predicted >50K of income probabilities 
#NOTE(review): `rf` still carries the cutoff = c(0.53, 0.47) mutation
#from above, so this "untuned" forest uses it too — confirm intended.
untunned.forest <- mlr::train(rf, traintask)
#Column 2 of the probability matrix is P(income == ">50K").
untunned.rf_prob <- predict(untunned.forest$learner.model,
                            newdata = newtest2, type = "prob")[, 2]
untunned.rf_prediction <- prediction(untunned.rf_prob, newtest2$income)
untunned.rf_performance <- ROCR::performance(untunned.rf_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
plot(untunned.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
untunned.rf.auc <- ROCR::performance(untunned.rf_prediction,
                                     measure = "auc")@y.values[[1]]
untunned.rf.auc
## [1] 0.8867613
#=====================================================================



#Tunned random forest model
#Getting predicted >50K of income probabilities 
tunned.rf_prob <- predict(rforest$learner.model, newdata = newtest2,
                     type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
plot(tunned.rf_performance, main = "ROC curve")
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
tunned.rf.auc <- ROCR::performance(tunned.rf_prediction,
                                   measure="auc")@y.values[[1]]
tunned.rf.auc
## [1] 0.8962369

\(\\\)

\(\\\)

Compare ROC and AUC of the tuned and untuned random forest models

set.seed(100)
#Compare ROC curve 
plot(tunned.rf_performance, main = "ROC curve", col = "blue")
plot(untunned.rf_performance, add = TRUE, col = "red")
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Tunned", "Untunned"), col = c("blue", "red"), lwd=3, cex=.8, horiz = TRUE)

\(\\\)

\(\\\)

Pick the threshold for each model that leads to the best accuracy

set.seed(100)
#Grid of candidate probability thresholds.
thresholds <- seq(from = 0.001, 0.999, 0.001)



#==================================================================



#Using the train dataset to pick a threshold for the untuned forest
untunned.rf_prob.train <- predict(untunned.forest$learner.model,
                            newdata = newtrain2, type = "prob")[, 2]



#Training-set accuracy at every candidate threshold.  vapply() replaces
#the original element-growing for-loop over 1:length(thresholds).
accuracy <- vapply(
  thresholds,
  function(th) {
    mean((untunned.rf_prob.train > th) == (newtrain2$income == ">50K"))
  },
  numeric(1)
)



#Threshold which gives maximum accuracy.  Index the threshold grid
#directly instead of multiplying the index by a hard-coded step of 0.001.
thres1 <- thresholds[which.max(accuracy)]
thres1
## [1] 0.32
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get confusion matrix of testset data using the optimal threshold.
#Both arguments are logicals (TRUE = ">50K"), so 'Positive' class is FALSE.
confusionMatrix(untunned.rf_prob > thres1, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11192  1104
##      TRUE   1243  2657
##                                                
##                Accuracy : 0.8551               
##                  95% CI : (0.8496, 0.8605)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5988               
##  Mcnemar's Test P-Value : 0.004392             
##                                                
##             Sensitivity : 0.9000               
##             Specificity : 0.7065               
##          Pos Pred Value : 0.9102               
##          Neg Pred Value : 0.6813               
##              Prevalence : 0.7678               
##          Detection Rate : 0.6910               
##    Detection Prevalence : 0.7592               
##       Balanced Accuracy : 0.8033               
##                                                
##        'Positive' Class : FALSE                
## 
# Test-set accuracy when classifying with the tuned threshold `thres1`.
rf.untunned.accuracy <- mean(
  (untunned.rf_prob > thres1) == (newtest2$income == ">50K")
)

# Test-set accuracy with the default 0.5 threshold, for comparison.
rf.untunned.accuracy.half <- mean(
  (untunned.rf_prob > 0.5) == (newtest2$income == ">50K")
)




#==================================================================



#Using the train dataset to pick a threshold for the tuned forest
tunned.rf_prob.train <- predict(rforest$learner.model,
                            newdata = newtrain2, type = "prob")[, 2]



#Training-set accuracy at every candidate threshold.  vapply() replaces
#the original element-growing for-loop over 1:length(thresholds).
accuracy <- vapply(
  thresholds,
  function(th) {
    mean((tunned.rf_prob.train > th) == (newtrain2$income == ">50K"))
  },
  numeric(1)
)



#Threshold which gives maximum accuracy.  Index the threshold grid
#directly instead of multiplying the index by a hard-coded step of 0.001.
thres2 <- thresholds[which.max(accuracy)]
thres2
## [1] 0.406
#plot of accuracy vs thresholds
threstable <- data.frame(thresholds, accuracy)
ggplot(threstable, aes(x = thresholds, y = accuracy)) + geom_point()

#Get confusion matrix of testset data using the optimal threshold.
#Both arguments are logicals (TRUE = ">50K"), so 'Positive' class is FALSE.
confusionMatrix(tunned.rf_prob > thres2, newtest2$income == ">50K")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE  TRUE
##      FALSE 11436  1257
##      TRUE    999  2504
##                                                
##                Accuracy : 0.8607               
##                  95% CI : (0.8553, 0.866)      
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5998               
##  Mcnemar's Test P-Value : 0.00000006273        
##                                                
##             Sensitivity : 0.9197               
##             Specificity : 0.6658               
##          Pos Pred Value : 0.9010               
##          Neg Pred Value : 0.7148               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7061               
##    Detection Prevalence : 0.7837               
##       Balanced Accuracy : 0.7927               
##                                                
##        'Positive' Class : FALSE                
## 
# Test-set accuracy when classifying with the tuned threshold `thres2`.
rf.tunned.accuracy <- mean(
  (tunned.rf_prob > thres2) == (newtest2$income == ">50K")
)

# Test-set accuracy with the default 0.5 threshold, for comparison.
rf.tunned.accuracy.half <- mean(
  (tunned.rf_prob > 0.5) == (newtest2$income == ">50K")
)

Compare AUC and Accuracy

set.seed(100)
#Compare AUC of the tuned vs untuned random forests, printed in
#ascending order
auc <- data.frame(tunned.rf.auc, untunned.rf.auc)
auc[, order(auc)]
##   untunned.rf.auc tunned.rf.auc
## 1       0.8867613     0.8962369
#Pick the model with the largest AUC
#NOTE(review): the winner is hard-coded to the tuned forest (rforest)
#rather than selected programmatically from the comparison above —
#this matches the printed ordering now, but confirm it if the data or
#tuning change.
final.auc3 <- rforest$learner.model



#Compare Accuracy - optimal threshold, printed in ascending order
accuracy.random.df <- data.frame(rf.tunned.accuracy, rf.untunned.accuracy)
accuracy.random.df[, order(accuracy.random.df)]
##   rf.untunned.accuracy rf.tunned.accuracy
## 1            0.8550877          0.8607063
#Pick the model with the highest Accuracy 
#NOTE(review): hard-coded choice (tuned forest); consistent with the
#printed comparison above but not derived from it — verify on rerun.
final.thres3 <- rforest$learner.model



#Compare Accuracy - default threshold(0.5), printed in ascending order
accuracy.random.df.half <- data.frame(rf.tunned.accuracy.half,
                                      rf.untunned.accuracy.half)
accuracy.random.df.half[, order(accuracy.random.df.half)]
##   rf.tunned.accuracy.half rf.untunned.accuracy.half
## 1               0.8647814                  0.866078
#Pick the model with the largest Accuracy 
#NOTE(review): here the UNTUNED forest is hard-coded as the winner, which
#matches the printed output (0.866078 > 0.8647814) — verify on rerun.
final.thres3.half <- untunned.forest$learner.model

Boosted Tree

Boosted Trees with five different models

set.seed(100)



#Change to binary digit
#Stack train and test so both share the same columns/levels; rows
#1:32402 are training and 32403:48598 are test (these index ranges are
#used throughout the boosting section below).
#as.numeric(factor) - 1 maps the first factor level to 0 and the second
#to 1 — assumes the level order puts "<=50K" first so 1 means ">50K";
#TODO confirm against the factor levels of newtrain2$income.
combined <- rbind(newtrain2, newtest2)
combined$income <- as.numeric(combined$income) - 1



#First model
#Baseline gradient boosted model: bernoulli deviance (binary income),
#5000 trees, interaction depth 5, shrinkage left at the gbm default.
#summary() prints and plots relative influence of each predictor.
boosting1 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
                interaction.depth = 5)
summary(boosting1)

##                                         var       rel.inf
## Married.civ.spouse       Married.civ.spouse 38.1837367227
## education.num                 education.num 20.9431635081
## capital.gain                   capital.gain 19.4087323351
## age                                     age  5.9208507786
## capital.loss                   capital.loss  5.6876403549
## hours.per.week               hours.per.week  4.1947843573
## Exec.managerial             Exec.managerial  2.1507741422
## Prof.specialty               Prof.specialty  0.7081632260
## Farming.fishing             Farming.fishing  0.6308016194
## Other.service                 Other.service  0.4673220145
## Wife                                   Wife  0.4662046626
## Self.emp.not.inc           Self.emp.not.inc  0.4328393928
## Tech.support                   Tech.support  0.2974840537
## Male                                   Male  0.2127979776
## Self.emp.inc                   Self.emp.inc  0.0755118841
## Sales                                 Sales  0.0653378908
## Machine.op.inspct         Machine.op.inspct  0.0257029903
## Handlers.cleaners         Handlers.cleaners  0.0256512584
## Married.AF.spouse         Married.AF.spouse  0.0189114782
## White                                 White  0.0172110098
## Local.gov                         Local.gov  0.0118613048
## Protective.serv             Protective.serv  0.0118433164
## Not.in.family                 Not.in.family  0.0098799096
## Never.married                 Never.married  0.0091057857
## Own.child                         Own.child  0.0083974189
## United.States                 United.States  0.0068051014
## Philippines                     Philippines  0.0026801348
## other_countries             other_countries  0.0017482677
## Transport.moving           Transport.moving  0.0012086023
## Private                             Private  0.0011931320
## Unmarried                         Unmarried  0.0007028850
## Asian.Pac.Islander       Asian.Pac.Islander  0.0003466238
## State.gov                         State.gov  0.0003137560
## Black                                 Black  0.0002921043
## No.gain                             No.gain  0.0000000000
## Married.spouse.absent Married.spouse.absent  0.0000000000
## Separated                         Separated  0.0000000000
## Widowed                             Widowed  0.0000000000
## Armed.Forces                   Armed.Forces  0.0000000000
## Craft.repair                   Craft.repair  0.0000000000
## Priv.house.serv             Priv.house.serv  0.0000000000
## Other.relative               Other.relative  0.0000000000
## Other                                 Other  0.0000000000
varImp(boosting1, numTrees = 5000)
##                             Overall
## age                    41799.956531
## education.num         147854.313003
## capital.gain          137021.552859
## capital.loss           40153.540174
## hours.per.week         29614.291991
## Local.gov                 83.738308
## No.gain                    0.000000
## Private                    8.423260
## Self.emp.inc             533.097960
## Self.emp.not.inc        3055.754736
## State.gov                  2.215051
## Married.AF.spouse        133.511044
## Married.civ.spouse    269569.120198
## Married.spouse.absent      0.000000
## Never.married             64.284925
## Separated                  0.000000
## Widowed                    0.000000
## Armed.Forces               0.000000
## Craft.repair               0.000000
## Exec.managerial        15184.011389
## Farming.fishing         4453.326263
## Handlers.cleaners        181.092469
## Machine.op.inspct        181.457685
## Other.service           3299.194765
## Priv.house.serv            0.000000
## Prof.specialty          4999.482874
## Protective.serv           83.611314
## Sales                    461.271715
## Tech.support            2100.174616
## Transport.moving           8.532477
## Not.in.family             69.750076
## Other.relative             0.000000
## Own.child                 59.284005
## Unmarried                  4.962219
## Wife                    3291.306496
## Asian.Pac.Islander         2.447091
## Black                      2.062194
## Other                      0.000000
## White                    121.506095
## Male                    1502.308798
## other_countries           12.342401
## Philippines               18.921186
## United.States             48.042579
#Test error of the first model
set.seed(100)
#Misclassification rate on the test rows (32403:48598; column 44 is the
#0/1 income response) as the ensemble grows: 10, 20, ..., 5000 trees.
#Preallocate instead of growing the vector inside the loop.
testerror1 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  #type = "response" returns probabilities; without it predict.gbm
  #returns values on the logit (log-odds) scale.
  yhat <- predict(boosting1, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  testerror1[i] <- mean(yhat != combined[32403:48598, 44])
}
plot(testerror1)

#ROC curve - testing (full 5000-tree ensemble)
pos1 <- predict(boosting1, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
predicts1 <- prediction(pos1, combined[32403:48598, 44])
roc1 <- ROCR::performance(predicts1, measure = "tpr", x.measure = "fpr")
plot(roc1)
abline(0, 1, col = "red")

auc1 <- ROCR::performance(predicts1, measure = "auc")
auc1@y.values
## [[1]]
## [1] 0.913919
#Train error of the first model (same sweep on the training rows)
set.seed(100)
trainerror1 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting1, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  trainerror1[i] <- mean(yhat != combined[1:32402, 44])
}
plot(trainerror1)

#ROC curve - training
pos1b <- predict(boosting1, newdata = combined[1:32402, -44], n.trees = 5000, type = "response")
predicts1b <- prediction(pos1b, combined[1:32402, 44])
roc1b <- ROCR::performance(predicts1b, measure = "tpr", x.measure = "fpr")
plot(roc1b)
abline(0, 1, col = "red")

auc1b <- ROCR::performance(predicts1b, measure = "auc")
auc1b@y.values
## [[1]]
## [1] 0.9161409
#Second model
#Same settings as model 1 but fewer trees (2000), to check whether the
#5000-tree fit gains anything from the extra iterations.
set.seed(100)
boosting2 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 2000,
                interaction.depth = 5)
summary(boosting2)

##                                         var      rel.inf
## Married.civ.spouse       Married.civ.spouse 43.183739803
## education.num                 education.num 22.269924180
## capital.gain                   capital.gain 20.182879377
## age                                     age  4.792613523
## capital.loss                   capital.loss  4.080112884
## hours.per.week               hours.per.week  3.115147219
## Exec.managerial             Exec.managerial  1.571469479
## Prof.specialty               Prof.specialty  0.461127929
## Farming.fishing             Farming.fishing  0.183178723
## Self.emp.not.inc           Self.emp.not.inc  0.063849502
## Other.service                 Other.service  0.042141531
## Wife                                   Wife  0.020667103
## Tech.support                   Tech.support  0.020408954
## Male                                   Male  0.005805212
## Sales                                 Sales  0.002968312
## Self.emp.inc                   Self.emp.inc  0.002497470
## Never.married                 Never.married  0.001468800
## Local.gov                         Local.gov  0.000000000
## No.gain                             No.gain  0.000000000
## Private                             Private  0.000000000
## State.gov                         State.gov  0.000000000
## Married.AF.spouse         Married.AF.spouse  0.000000000
## Married.spouse.absent Married.spouse.absent  0.000000000
## Separated                         Separated  0.000000000
## Widowed                             Widowed  0.000000000
## Armed.Forces                   Armed.Forces  0.000000000
## Craft.repair                   Craft.repair  0.000000000
## Handlers.cleaners         Handlers.cleaners  0.000000000
## Machine.op.inspct         Machine.op.inspct  0.000000000
## Priv.house.serv             Priv.house.serv  0.000000000
## Protective.serv             Protective.serv  0.000000000
## Transport.moving           Transport.moving  0.000000000
## Not.in.family                 Not.in.family  0.000000000
## Other.relative               Other.relative  0.000000000
## Own.child                         Own.child  0.000000000
## Unmarried                         Unmarried  0.000000000
## Asian.Pac.Islander       Asian.Pac.Islander  0.000000000
## Black                                 Black  0.000000000
## Other                                 Other  0.000000000
## White                                 White  0.000000000
## other_countries             other_countries  0.000000000
## Philippines                     Philippines  0.000000000
## United.States                 United.States  0.000000000
varImp(boosting2, numTrees = 2000)
##                             Overall
## age                    29254.355185
## education.num         135936.742808
## capital.gain          123197.315843
## capital.loss           24905.215267
## hours.per.week         19015.016079
## Local.gov                  0.000000
## No.gain                    0.000000
## Private                    0.000000
## Self.emp.inc              15.244682
## Self.emp.not.inc         389.740585
## State.gov                  0.000000
## Married.AF.spouse          0.000000
## Married.civ.spouse    263595.730442
## Married.spouse.absent      0.000000
## Never.married              8.965628
## Separated                  0.000000
## Widowed                    0.000000
## Armed.Forces               0.000000
## Craft.repair               0.000000
## Exec.managerial         9592.329127
## Farming.fishing         1118.132183
## Handlers.cleaners          0.000000
## Machine.op.inspct          0.000000
## Other.service            257.234036
## Priv.house.serv            0.000000
## Prof.specialty          2814.748184
## Protective.serv            0.000000
## Sales                     18.118727
## Tech.support             124.577289
## Transport.moving           0.000000
## Not.in.family              0.000000
## Other.relative             0.000000
## Own.child                  0.000000
## Unmarried                  0.000000
## Wife                     126.153040
## Asian.Pac.Islander         0.000000
## Black                      0.000000
## Other                      0.000000
## White                      0.000000
## Male                      35.435306
## other_countries            0.000000
## Philippines                0.000000
## United.States              0.000000
#Test error of the second model
set.seed(100)
#Misclassification rate on the test rows as the ensemble grows
#(10, 20, ..., 2000 trees). Preallocated rather than grown in the loop.
testerror2 <- numeric(200)
thresh <- 0.5
for(i in 1:200){
  yhat <- predict(boosting2, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  testerror2[i] <- mean(yhat != combined[32403:48598, 44])
}
plot(testerror2)

#ROC curve - testing (full 2000-tree ensemble)
pos2 <- predict(boosting2, newdata = combined[32403:48598, -44], n.trees = 2000, type = "response")
predicts2 <- prediction(pos2, combined[32403:48598, 44])
roc2 <- ROCR::performance(predicts2, measure = "tpr", x.measure = "fpr")
plot(roc2)
abline(0, 1, col = "red")

auc2 <- ROCR::performance(predicts2, measure = "auc")
auc2@y.values
## [[1]]
## [1] 0.9026846
#Train error of the second model
set.seed(100)
trainerror2 <- numeric(200)
thresh <- 0.5
for(i in 1:200){
  yhat <- predict(boosting2, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  trainerror2[i] <- mean(yhat != combined[1:32402, 44])
}
plot(trainerror2)

#ROC curve - training
pos2b <- predict(boosting2, newdata = combined[1:32402, -44], n.trees = 2000, type = "response")
predicts2b <- prediction(pos2b, combined[1:32402, 44])
roc2b <- ROCR::performance(predicts2b, measure = "tpr", x.measure = "fpr")
plot(roc2b)
abline(0, 1, col = "red")

auc2b <- ROCR::performance(predicts2b, measure = "auc")
auc2b@y.values
## [[1]]
## [1] 0.9032523
#Third model
#Same as model 1 (5000 trees) but shallower trees: interaction depth 3.
set.seed(100)
boosting3 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
                interaction.depth = 3)
summary(boosting3)

##                                         var       rel.inf
## Married.civ.spouse       Married.civ.spouse 38.4653932551
## capital.gain                   capital.gain 21.5925456394
## education.num                 education.num 20.6266997102
## age                                     age  5.9262919869
## capital.loss                   capital.loss  4.9121468429
## hours.per.week               hours.per.week  4.1693779643
## Exec.managerial             Exec.managerial  2.1015494026
## Prof.specialty               Prof.specialty  0.5954335020
## Farming.fishing             Farming.fishing  0.5462841653
## Other.service                 Other.service  0.3328796337
## Wife                                   Wife  0.2756071261
## Self.emp.not.inc           Self.emp.not.inc  0.2224774222
## Tech.support                   Tech.support  0.1327361634
## Male                                   Male  0.0496445933
## Self.emp.inc                   Self.emp.inc  0.0362639675
## Sales                                 Sales  0.0067688417
## Married.AF.spouse         Married.AF.spouse  0.0028144804
## Not.in.family                 Not.in.family  0.0022786683
## Local.gov                         Local.gov  0.0012623225
## United.States                 United.States  0.0009382238
## White                                 White  0.0006060883
## No.gain                             No.gain  0.0000000000
## Private                             Private  0.0000000000
## State.gov                         State.gov  0.0000000000
## Married.spouse.absent Married.spouse.absent  0.0000000000
## Never.married                 Never.married  0.0000000000
## Separated                         Separated  0.0000000000
## Widowed                             Widowed  0.0000000000
## Armed.Forces                   Armed.Forces  0.0000000000
## Craft.repair                   Craft.repair  0.0000000000
## Handlers.cleaners         Handlers.cleaners  0.0000000000
## Machine.op.inspct         Machine.op.inspct  0.0000000000
## Priv.house.serv             Priv.house.serv  0.0000000000
## Protective.serv             Protective.serv  0.0000000000
## Transport.moving           Transport.moving  0.0000000000
## Other.relative               Other.relative  0.0000000000
## Own.child                         Own.child  0.0000000000
## Unmarried                         Unmarried  0.0000000000
## Asian.Pac.Islander       Asian.Pac.Islander  0.0000000000
## Black                                 Black  0.0000000000
## Other                                 Other  0.0000000000
## other_countries             other_countries  0.0000000000
## Philippines                     Philippines  0.0000000000
varImp(boosting3, numTrees = 5000)
##                             Overall
## age                    39889.279469
## education.num         138836.255637
## capital.gain          145337.268121
## capital.loss           33063.169794
## hours.per.week         28063.666657
## Local.gov                  8.496567
## No.gain                    0.000000
## Private                    0.000000
## Self.emp.inc             244.089143
## Self.emp.not.inc        1497.473309
## State.gov                  0.000000
## Married.AF.spouse         18.943987
## Married.civ.spouse    258906.720229
## Married.spouse.absent      0.000000
## Never.married              0.000000
## Separated                  0.000000
## Widowed                    0.000000
## Armed.Forces               0.000000
## Craft.repair               0.000000
## Exec.managerial        14145.319134
## Farming.fishing         3676.984156
## Handlers.cleaners          0.000000
## Machine.op.inspct          0.000000
## Other.service           2240.579568
## Priv.house.serv            0.000000
## Prof.specialty          4007.803433
## Protective.serv            0.000000
## Sales                     45.560398
## Tech.support             893.433859
## Transport.moving           0.000000
## Not.in.family             15.337489
## Other.relative             0.000000
## Own.child                  0.000000
## Unmarried                  0.000000
## Wife                    1855.084039
## Asian.Pac.Islander         0.000000
## Black                      0.000000
## Other                      0.000000
## White                      4.079520
## Male                     334.152799
## other_countries            0.000000
## Philippines                0.000000
## United.States              6.315091
#Test error of the third model
set.seed(100)
#Misclassification rate on the test rows as the ensemble grows
#(10, 20, ..., 5000 trees). Preallocated rather than grown in the loop.
testerror3 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting3, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  testerror3[i] <- mean(yhat != combined[32403:48598, 44])
}
plot(testerror3)

#ROC curve - testing (full 5000-tree ensemble)
pos3 <- predict(boosting3, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
predicts3 <- prediction(pos3, combined[32403:48598, 44])
roc3 <- ROCR::performance(predicts3, measure = "tpr", x.measure = "fpr")
plot(roc3)
abline(0, 1, col = "red")

auc3 <- ROCR::performance(predicts3, measure = "auc")
auc3@y.values
## [[1]]
## [1] 0.9086526
#Train error of the third model
set.seed(100)
trainerror3 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting3, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  trainerror3[i] <- mean(yhat != combined[1:32402, 44])
}
plot(trainerror3)

#ROC curve - training
pos3b <- predict(boosting3, newdata = combined[1:32402, -44], n.trees = 5000, type = "response")
predicts3b <- prediction(pos3b, combined[1:32402, 44])
roc3b <- ROCR::performance(predicts3b, measure = "tpr", x.measure = "fpr")
plot(roc3b)
abline(0, 1, col = "red")

auc3b <- ROCR::performance(predicts3b, measure = "auc")
auc3b@y.values
## [[1]]
## [1] 0.9100084
#Fourth model
#Depth-3 trees with a large explicit shrinkage (learning rate) of 0.2 —
#fits fast, so note the reduced tree counts used when evaluating below.
set.seed(100)
boosting4 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
                interaction.depth = 3, shrinkage = 0.2)
summary(boosting4)

##                                         var      rel.inf
## Married.civ.spouse       Married.civ.spouse 18.026448560
## age                                     age 16.972500853
## education.num                 education.num 14.284120321
## hours.per.week               hours.per.week 12.663868474
## capital.gain                   capital.gain  9.733799355
## capital.loss                   capital.loss  4.043940259
## Exec.managerial             Exec.managerial  2.339896499
## Wife                                   Wife  2.312137098
## Self.emp.inc                   Self.emp.inc  1.477410233
## Sales                                 Sales  1.421837748
## Self.emp.not.inc           Self.emp.not.inc  1.363968271
## Prof.specialty               Prof.specialty  1.270072643
## Craft.repair                   Craft.repair  1.217346462
## Private                             Private  0.979140053
## Local.gov                         Local.gov  0.962075427
## Tech.support                   Tech.support  0.956414372
## Transport.moving           Transport.moving  0.929744557
## Protective.serv             Protective.serv  0.906260989
## Married.AF.spouse         Married.AF.spouse  0.847841879
## Male                                   Male  0.735702554
## State.gov                         State.gov  0.701580425
## Farming.fishing             Farming.fishing  0.614732088
## Not.in.family                 Not.in.family  0.542609381
## White                                 White  0.528052568
## other_countries             other_countries  0.515926852
## Asian.Pac.Islander       Asian.Pac.Islander  0.511077925
## United.States                 United.States  0.447415310
## Machine.op.inspct         Machine.op.inspct  0.424901024
## Never.married                 Never.married  0.387397190
## Other.service                 Other.service  0.383110797
## Black                                 Black  0.366741057
## Philippines                     Philippines  0.223151705
## Unmarried                         Unmarried  0.220840811
## Widowed                             Widowed  0.168091780
## Handlers.cleaners         Handlers.cleaners  0.138868006
## Married.spouse.absent Married.spouse.absent  0.137691429
## Separated                         Separated  0.100353469
## Other                                 Other  0.049692624
## Other.relative               Other.relative  0.040436779
## Own.child                         Own.child  0.038139196
## No.gain                             No.gain  0.007916710
## Priv.house.serv             Priv.house.serv  0.006746267
## Armed.Forces                   Armed.Forces  0.000000000
varImp(boosting4, numTrees = 5000)
##                            Overall
## age                   1625.9627048
## education.num         1368.4163056
## capital.gain           932.4963283
## capital.loss           387.4087914
## hours.per.week        1213.1964533
## Local.gov               92.1666628
## No.gain                  0.7584195
## Private                 93.8014511
## Self.emp.inc           141.5356499
## Self.emp.not.inc       130.6679292
## State.gov               67.2112858
## Married.AF.spouse       81.2231083
## Married.civ.spouse    1726.9307165
## Married.spouse.absent   13.1908156
## Never.married           37.1125852
## Separated                9.6138453
## Widowed                 16.1031641
## Armed.Forces             0.0000000
## Craft.repair           116.6215847
## Exec.managerial        224.1616879
## Farming.fishing         58.8912299
## Handlers.cleaners       13.3035314
## Machine.op.inspct       40.7054461
## Other.service           36.7019494
## Priv.house.serv          0.6462912
## Prof.specialty         121.6727439
## Protective.serv         86.8196491
## Sales                  136.2118152
## Tech.support            91.6243347
## Transport.moving        89.0693709
## Not.in.family           51.9818867
## Other.relative           3.8738366
## Own.child                3.6537285
## Unmarried               21.1565123
## Wife                   221.5023421
## Asian.Pac.Islander      48.9611786
## Black                   35.1337311
## Other                    4.7605449
## White                   50.5873465
## Male                    70.4801800
## other_countries         49.4257050
## Philippines             21.3778955
## United.States           42.8623109
#Test error of the fourth model
set.seed(100)
#Misclassification rate on the test rows as the ensemble grows
#(10, 20, ..., 5000 trees). Preallocated rather than grown in the loop.
testerror4 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting4, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  testerror4[i] <- mean(yhat != combined[32403:48598, 44])
}
plot(testerror4)

#ROC curve - testing
#NOTE(review): only 150 trees are used here (vs 5000 for the training
#ROC below) — presumably the near-optimal point from the test-error
#sweep under the 0.2 shrinkage; confirm this asymmetry is intended.
pos4 <- predict(boosting4, newdata = combined[32403:48598, -44], n.trees = 150, type = "response")
predicts4 <- prediction(pos4, combined[32403:48598, 44])
roc4 <- ROCR::performance(predicts4, measure = "tpr", x.measure = "fpr")
plot(roc4)
abline(0, 1, col = "red")

auc4 <- ROCR::performance(predicts4, measure = "auc")
auc4@y.values
## [[1]]
## [1] 0.9209653
#Train error of the fourth model
set.seed(100)
trainerror4 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting4, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  trainerror4[i] <- mean(yhat != combined[1:32402, 44])
}
plot(trainerror4)

#ROC curve - training
pos4b <- predict(boosting4, newdata = combined[1:32402, -44], n.trees = 5000, type = "response")
predicts4b <- prediction(pos4b, combined[1:32402, 44])
roc4b <- ROCR::performance(predicts4b, measure = "tpr", x.measure = "fpr")
plot(roc4b)
abline(0, 1, col = "red")

auc4b <- ROCR::performance(predicts4b, measure = "auc")
auc4b@y.values
## [[1]]
## [1] 0.9556233
#Fifth model
#Depth-3 trees with an explicit shrinkage of 0.1 (between model 3's
#default and model 4's 0.2).
set.seed(100)
boosting5 <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 5000,
                interaction.depth = 3, shrinkage = 0.1)
summary(boosting5)

##                                         var      rel.inf
## Married.civ.spouse       Married.civ.spouse 21.749111849
## education.num                 education.num 15.520807408
## age                                     age 13.841105494
## capital.gain                   capital.gain 13.128327788
## hours.per.week               hours.per.week 10.656667597
## capital.loss                   capital.loss  5.072048205
## Exec.managerial             Exec.managerial  2.049942975
## Wife                                   Wife  1.926472668
## Prof.specialty               Prof.specialty  1.288391194
## Self.emp.not.inc           Self.emp.not.inc  1.139338856
## Self.emp.inc                   Self.emp.inc  1.033026573
## Sales                                 Sales  1.017303679
## Private                             Private  0.863058329
## Married.AF.spouse         Married.AF.spouse  0.823091712
## Tech.support                   Tech.support  0.774572610
## Craft.repair                   Craft.repair  0.742961349
## Local.gov                         Local.gov  0.739759906
## Protective.serv             Protective.serv  0.687410865
## Transport.moving           Transport.moving  0.670745956
## Male                                   Male  0.641166610
## Farming.fishing             Farming.fishing  0.618918294
## State.gov                         State.gov  0.567625482
## Not.in.family                 Not.in.family  0.463684678
## White                                 White  0.435534037
## Asian.Pac.Islander       Asian.Pac.Islander  0.430976991
## Other.service                 Other.service  0.422750642
## other_countries             other_countries  0.353529212
## Machine.op.inspct         Machine.op.inspct  0.329448541
## United.States                 United.States  0.307138100
## Never.married                 Never.married  0.282278336
## Black                                 Black  0.268746977
## Unmarried                         Unmarried  0.228712965
## Philippines                     Philippines  0.222233474
## Widowed                             Widowed  0.167860893
## Handlers.cleaners         Handlers.cleaners  0.156685930
## Married.spouse.absent Married.spouse.absent  0.106604852
## Separated                         Separated  0.106553495
## Own.child                         Own.child  0.063757082
## Other                                 Other  0.053894161
## Other.relative               Other.relative  0.036158013
## No.gain                             No.gain  0.007362428
## Priv.house.serv             Priv.house.serv  0.004233794
## Armed.Forces                   Armed.Forces  0.000000000
varImp(boosting5, numTrees = 5000)
##                            Overall
## age                   1885.5216440
## education.num         2114.3411062
## capital.gain          1788.4226230
## capital.loss           690.9460140
## hours.per.week        1451.7176692
## Local.gov              100.7747043
## No.gain                  1.0029558
## Private                117.5711839
## Self.emp.inc           140.7253173
## Self.emp.not.inc       155.2078390
## State.gov               77.3254804
## Married.AF.spouse      112.1266822
## Married.civ.spouse    2962.7995501
## Married.spouse.absent   14.5223773
## Never.married           38.4537139
## Separated               14.5153811
## Widowed                 22.8670569
## Armed.Forces             0.0000000
## Craft.repair           101.2108249
## Exec.managerial        279.2560067
## Farming.fishing         84.3129070
## Handlers.cleaners       21.3447339
## Machine.op.inspct       44.8795333
## Other.service           57.5897270
## Priv.house.serv          0.5767538
## Prof.specialty         175.5126773
## Protective.serv         93.6433918
## Sales                  138.5834467
## Tech.support           105.5171079
## Transport.moving        91.3731940
## Not.in.family           63.1660164
## Other.relative           4.9256698
## Own.child                8.6853870
## Unmarried               31.1567054
## Wife                   262.4361121
## Asian.Pac.Islander      58.7103714
## Black                   36.6103879
## Other                    7.3417984
## White                   59.3311606
## Male                    87.3437111
## other_countries         48.1599524
## Philippines             30.2740288
## United.States           41.8402660
#Test error of the fifth model
set.seed(100)
#Misclassification rate on the test rows as the ensemble grows
#(10, 20, ..., 5000 trees). Preallocated rather than grown in the loop.
testerror5 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting5, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  testerror5[i] <- mean(yhat != combined[32403:48598, 44])
}
plot(testerror5)

#ROC curve - testing
#NOTE(review): 800 trees here vs 5000 for the training ROC below —
#presumably chosen from the test-error sweep; confirm intended.
pos5 <- predict(boosting5, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[32403:48598, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "fpr")
plot(roc5)
abline(0, 1, col = "red")

auc5 <- ROCR::performance(predicts5, measure = "auc")
auc5@y.values
## [[1]]
## [1] 0.9231948
#Train error of the fifth model
set.seed(100)
trainerror5 <- numeric(500)
thresh <- 0.5
for(i in 1:500){
  yhat <- predict(boosting5, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
  yhat <- (yhat > thresh)
  trainerror5[i] <- mean(yhat != combined[1:32402, 44])
}
plot(trainerror5)

#ROC curve - training
pos5b <- predict(boosting5, newdata = combined[1:32402, -44], n.trees = 5000, type = "response")
predicts5b <- prediction(pos5b, combined[1:32402, 44])
roc5b <- ROCR::performance(predicts5b, measure = "tpr", x.measure = "fpr")
plot(roc5b)
abline(0, 1, col = "red")

auc5b <- ROCR::performance(predicts5b, measure = "auc")
auc5b@y.values
## [[1]]
## [1] 0.9496323
#ROC and AUC combined testing
#Overlay the five test-set ROC curves on one set of axes; par(new = TRUE)
#draws each subsequent curve over the previous plot without clearing it
#(all ROC axes span [0, 1], so the curves line up). The title on the
#final plot doubles as the colour legend.
plot(roc1, type = "l", col = "red")
par(new = TRUE)
plot(roc2, type = "l", col = "green")
par(new = TRUE)
plot(roc3, type = "l", col = "blue")
par(new = TRUE)
plot(roc4, type = "l", col = "black")
par(new = TRUE)
plot(roc5, type = "l", col = "yellow",
     main = "model1: red, model2: green, model3: blue, model4: black, model5: yellow")

#Print each model's test AUC rounded to 5 decimals
paste("AUC for model 1 is", round(auc1@y.values[[1]], 5))
## [1] "AUC for model 1 is 0.91392"
paste("AUC for model 2 is", round(auc2@y.values[[1]], 5))
## [1] "AUC for model 2 is 0.90268"
paste("AUC for model 3 is", round(auc3@y.values[[1]], 5))
## [1] "AUC for model 3 is 0.90865"
paste("AUC for model 4 is", round(auc4@y.values[[1]], 5))
## [1] "AUC for model 4 is 0.92097"
paste("AUC for model 5 is", round(auc5@y.values[[1]], 5))
## [1] "AUC for model 5 is 0.92319"
# ROC and AUC combined training.
# Same overlay as the test-set figure, using ROCR's add = TRUE rather than
# the fragile par(new = TRUE) trick.
plot(roc1b, type = "l", col = "red",
     main = "model1: red, model2: green, model3: blue, model4: black, model5: yellow")
plot(roc2b, type = "l", col = "green", add = TRUE)
plot(roc3b, type = "l", col = "blue", add = TRUE)
plot(roc4b, type = "l", col = "black", add = TRUE)
plot(roc5b, type = "l", col = "yellow", add = TRUE)

# Training-set AUC for each of the five models.
paste("AUC for model 1 is", round(auc1b@y.values[[1]], 5))
## [1] "AUC for model 1 is 0.91614"
paste("AUC for model 2 is", round(auc2b@y.values[[1]], 5))
## [1] "AUC for model 2 is 0.90325"
paste("AUC for model 3 is", round(auc3b@y.values[[1]], 5))
## [1] "AUC for model 3 is 0.91001"
paste("AUC for model 4 is", round(auc4b@y.values[[1]], 5))
## [1] "AUC for model 4 is 0.95562"
paste("AUC for model 5 is", round(auc5b@y.values[[1]], 5))
## [1] "AUC for model 5 is 0.94963"
# Partial dependence plots for six influential predictors, drawn for each
# of the five boosting models in turn (2x3 panel per model).
variables <- c("Married.civ.spouse", "education.num", "age", "capital.gain",
               "hours.per.week", "capital.loss")
par(mfrow = c(2, 3))
# One loop over the models replaces five copy-pasted identical loops.
# `i.var` is spelled out: the original `i =` relied on partial argument
# matching against gbm's plot.gbm(x, i.var, ...), which lintr flags.
# plot.gbm accepts variable names (characters) as well as indices.
for (m in list(boosting1, boosting2, boosting3, boosting4, boosting5)) {
  for (v in variables) {
    plot(m, i.var = v)
  }
}

# Check class imbalance of the response variable.
table(combined$income)
## 
##     0     1 
## 37155 11443
# Class proportions computed from the table itself, instead of re-typing
# the counts as hard-coded divisions (11443/48598, 37155/48598).
prop.table(table(combined$income))  # ~76.5% negative (0), ~23.5% positive (1)
##         0         1 
## 0.7645376 0.2354624

Train function

# Tune the gbm model with caret: 10-fold cross-validation repeated 5 times,
# selecting the candidate with the best accuracy.
set.seed(100)
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
boostingtrain <- caret::train(income ~ ., data = newtrain2, method = "gbm",
                              metric = "Accuracy", trControl = trctrl)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0185
##      2        1.0284             nan     0.1000    0.0149
##      3        1.0038             nan     0.1000    0.0122
##      4        0.9833             nan     0.1000    0.0100
##      5        0.9639             nan     0.1000    0.0097
##      6        0.9472             nan     0.1000    0.0080
##      7        0.9302             nan     0.1000    0.0083
##      8        0.9174             nan     0.1000    0.0062
##      9        0.9075             nan     0.1000    0.0049
##     10        0.8949             nan     0.1000    0.0063
##     20        0.8154             nan     0.1000    0.0023
##     40        0.7375             nan     0.1000    0.0012
##     60        0.6956             nan     0.1000    0.0010
##     80        0.6715             nan     0.1000    0.0006
##    100        0.6560             nan     0.1000    0.0002
##    120        0.6452             nan     0.1000    0.0001
##    140        0.6367             nan     0.1000    0.0003
##    150        0.6334             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0438             nan     0.1000    0.0260
##      2        1.0038             nan     0.1000    0.0197
##      3        0.9699             nan     0.1000    0.0165
##      4        0.9434             nan     0.1000    0.0132
##      5        0.9141             nan     0.1000    0.0146
##      6        0.8911             nan     0.1000    0.0116
##      7        0.8742             nan     0.1000    0.0082
##      8        0.8586             nan     0.1000    0.0080
##      9        0.8429             nan     0.1000    0.0079
##     10        0.8321             nan     0.1000    0.0054
##     20        0.7506             nan     0.1000    0.0022
##     40        0.6788             nan     0.1000    0.0008
##     60        0.6486             nan     0.1000    0.0004
##     80        0.6321             nan     0.1000    0.0002
##    100        0.6210             nan     0.1000    0.0002
##    120        0.6127             nan     0.1000    0.0001
##    140        0.6062             nan     0.1000    0.0001
##    150        0.6032             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0367             nan     0.1000    0.0296
##      2        0.9886             nan     0.1000    0.0236
##      3        0.9516             nan     0.1000    0.0183
##      4        0.9208             nan     0.1000    0.0154
##      5        0.8950             nan     0.1000    0.0129
##      6        0.8698             nan     0.1000    0.0124
##      7        0.8497             nan     0.1000    0.0100
##      8        0.8337             nan     0.1000    0.0080
##      9        0.8169             nan     0.1000    0.0082
##     10        0.8030             nan     0.1000    0.0070
##     20        0.7199             nan     0.1000    0.0027
##     40        0.6546             nan     0.1000    0.0010
##     60        0.6279             nan     0.1000    0.0002
##     80        0.6132             nan     0.1000    0.0001
##    100        0.6013             nan     0.1000    0.0001
##    120        0.5948             nan     0.1000    0.0002
##    140        0.5893             nan     0.1000    0.0001
##    150        0.5867             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0187
##      2        1.0279             nan     0.1000    0.0150
##      3        1.0036             nan     0.1000    0.0122
##      4        0.9832             nan     0.1000    0.0104
##      5        0.9636             nan     0.1000    0.0098
##      6        0.9491             nan     0.1000    0.0070
##      7        0.9328             nan     0.1000    0.0083
##      8        0.9173             nan     0.1000    0.0078
##      9        0.9072             nan     0.1000    0.0048
##     10        0.8946             nan     0.1000    0.0062
##     20        0.8117             nan     0.1000    0.0033
##     40        0.7370             nan     0.1000    0.0010
##     60        0.6943             nan     0.1000    0.0007
##     80        0.6707             nan     0.1000    0.0004
##    100        0.6552             nan     0.1000    0.0003
##    120        0.6448             nan     0.1000    0.0001
##    140        0.6365             nan     0.1000    0.0000
##    150        0.6329             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0431             nan     0.1000    0.0257
##      2        1.0030             nan     0.1000    0.0200
##      3        0.9693             nan     0.1000    0.0170
##      4        0.9415             nan     0.1000    0.0137
##      5        0.9126             nan     0.1000    0.0144
##      6        0.8891             nan     0.1000    0.0116
##      7        0.8715             nan     0.1000    0.0090
##      8        0.8541             nan     0.1000    0.0087
##      9        0.8392             nan     0.1000    0.0076
##     10        0.8281             nan     0.1000    0.0056
##     20        0.7486             nan     0.1000    0.0027
##     40        0.6755             nan     0.1000    0.0010
##     60        0.6468             nan     0.1000    0.0005
##     80        0.6319             nan     0.1000    0.0003
##    100        0.6195             nan     0.1000    0.0003
##    120        0.6115             nan     0.1000    0.0003
##    140        0.6053             nan     0.1000    0.0001
##    150        0.6023             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0359             nan     0.1000    0.0296
##      2        0.9895             nan     0.1000    0.0236
##      3        0.9521             nan     0.1000    0.0185
##      4        0.9194             nan     0.1000    0.0157
##      5        0.8942             nan     0.1000    0.0124
##      6        0.8710             nan     0.1000    0.0113
##      7        0.8490             nan     0.1000    0.0109
##      8        0.8308             nan     0.1000    0.0091
##      9        0.8155             nan     0.1000    0.0077
##     10        0.8019             nan     0.1000    0.0067
##     20        0.7189             nan     0.1000    0.0026
##     40        0.6532             nan     0.1000    0.0008
##     60        0.6278             nan     0.1000    0.0002
##     80        0.6123             nan     0.1000    0.0004
##    100        0.6011             nan     0.1000    0.0001
##    120        0.5929             nan     0.1000    0.0001
##    140        0.5870             nan     0.1000    0.0001
##    150        0.5844             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0577             nan     0.1000    0.0186
##      2        1.0270             nan     0.1000    0.0149
##      3        1.0030             nan     0.1000    0.0121
##      4        0.9828             nan     0.1000    0.0097
##      5        0.9629             nan     0.1000    0.0096
##      6        0.9456             nan     0.1000    0.0084
##      7        0.9296             nan     0.1000    0.0077
##      8        0.9168             nan     0.1000    0.0067
##      9        0.9039             nan     0.1000    0.0064
##     10        0.8914             nan     0.1000    0.0062
##     20        0.8145             nan     0.1000    0.0023
##     40        0.7356             nan     0.1000    0.0014
##     60        0.6942             nan     0.1000    0.0010
##     80        0.6687             nan     0.1000    0.0005
##    100        0.6541             nan     0.1000    0.0002
##    120        0.6431             nan     0.1000    0.0001
##    140        0.6353             nan     0.1000    0.0003
##    150        0.6322             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0433             nan     0.1000    0.0257
##      2        1.0031             nan     0.1000    0.0197
##      3        0.9700             nan     0.1000    0.0163
##      4        0.9425             nan     0.1000    0.0136
##      5        0.9135             nan     0.1000    0.0147
##      6        0.8906             nan     0.1000    0.0115
##      7        0.8709             nan     0.1000    0.0097
##      8        0.8562             nan     0.1000    0.0074
##      9        0.8425             nan     0.1000    0.0067
##     10        0.8308             nan     0.1000    0.0056
##     20        0.7479             nan     0.1000    0.0035
##     40        0.6767             nan     0.1000    0.0012
##     60        0.6450             nan     0.1000    0.0005
##     80        0.6295             nan     0.1000    0.0003
##    100        0.6194             nan     0.1000    0.0001
##    120        0.6106             nan     0.1000    0.0001
##    140        0.6040             nan     0.1000    0.0001
##    150        0.6014             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0362             nan     0.1000    0.0293
##      2        0.9906             nan     0.1000    0.0227
##      3        0.9517             nan     0.1000    0.0202
##      4        0.9212             nan     0.1000    0.0154
##      5        0.8957             nan     0.1000    0.0128
##      6        0.8732             nan     0.1000    0.0111
##      7        0.8513             nan     0.1000    0.0110
##      8        0.8333             nan     0.1000    0.0091
##      9        0.8173             nan     0.1000    0.0076
##     10        0.8051             nan     0.1000    0.0059
##     20        0.7203             nan     0.1000    0.0023
##     40        0.6532             nan     0.1000    0.0010
##     60        0.6264             nan     0.1000    0.0003
##     80        0.6106             nan     0.1000    0.0002
##    100        0.5997             nan     0.1000    0.0001
##    120        0.5906             nan     0.1000    0.0003
##    140        0.5850             nan     0.1000    0.0000
##    150        0.5827             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0592             nan     0.1000    0.0186
##      2        1.0298             nan     0.1000    0.0150
##      3        1.0052             nan     0.1000    0.0122
##      4        0.9850             nan     0.1000    0.0096
##      5        0.9653             nan     0.1000    0.0098
##      6        0.9515             nan     0.1000    0.0069
##      7        0.9350             nan     0.1000    0.0084
##      8        0.9193             nan     0.1000    0.0078
##      9        0.9061             nan     0.1000    0.0064
##     10        0.8932             nan     0.1000    0.0065
##     20        0.8137             nan     0.1000    0.0032
##     40        0.7373             nan     0.1000    0.0012
##     60        0.6957             nan     0.1000    0.0005
##     80        0.6710             nan     0.1000    0.0002
##    100        0.6550             nan     0.1000    0.0002
##    120        0.6428             nan     0.1000    0.0001
##    140        0.6359             nan     0.1000    0.0001
##    150        0.6329             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0257
##      2        1.0042             nan     0.1000    0.0193
##      3        0.9717             nan     0.1000    0.0160
##      4        0.9436             nan     0.1000    0.0143
##      5        0.9146             nan     0.1000    0.0143
##      6        0.8914             nan     0.1000    0.0116
##      7        0.8756             nan     0.1000    0.0081
##      8        0.8602             nan     0.1000    0.0077
##      9        0.8444             nan     0.1000    0.0082
##     10        0.8330             nan     0.1000    0.0056
##     20        0.7507             nan     0.1000    0.0028
##     40        0.6780             nan     0.1000    0.0010
##     60        0.6461             nan     0.1000    0.0004
##     80        0.6305             nan     0.1000    0.0001
##    100        0.6197             nan     0.1000    0.0002
##    120        0.6113             nan     0.1000    0.0002
##    140        0.6053             nan     0.1000   -0.0000
##    150        0.6026             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0358             nan     0.1000    0.0294
##      2        0.9889             nan     0.1000    0.0229
##      3        0.9511             nan     0.1000    0.0187
##      4        0.9214             nan     0.1000    0.0150
##      5        0.8956             nan     0.1000    0.0126
##      6        0.8706             nan     0.1000    0.0125
##      7        0.8491             nan     0.1000    0.0107
##      8        0.8315             nan     0.1000    0.0089
##      9        0.8155             nan     0.1000    0.0077
##     10        0.8008             nan     0.1000    0.0071
##     20        0.7214             nan     0.1000    0.0030
##     40        0.6546             nan     0.1000    0.0007
##     60        0.6268             nan     0.1000    0.0007
##     80        0.6112             nan     0.1000    0.0002
##    100        0.6013             nan     0.1000    0.0000
##    120        0.5946             nan     0.1000    0.0001
##    140        0.5874             nan     0.1000   -0.0000
##    150        0.5844             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0576             nan     0.1000    0.0187
##      2        1.0276             nan     0.1000    0.0150
##      3        1.0031             nan     0.1000    0.0122
##      4        0.9825             nan     0.1000    0.0102
##      5        0.9649             nan     0.1000    0.0086
##      6        0.9459             nan     0.1000    0.0096
##      7        0.9324             nan     0.1000    0.0069
##      8        0.9166             nan     0.1000    0.0078
##      9        0.9038             nan     0.1000    0.0063
##     10        0.8908             nan     0.1000    0.0064
##     20        0.8132             nan     0.1000    0.0022
##     40        0.7344             nan     0.1000    0.0012
##     60        0.6931             nan     0.1000    0.0010
##     80        0.6686             nan     0.1000    0.0006
##    100        0.6537             nan     0.1000    0.0001
##    120        0.6420             nan     0.1000    0.0002
##    140        0.6340             nan     0.1000    0.0001
##    150        0.6308             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0429             nan     0.1000    0.0259
##      2        1.0035             nan     0.1000    0.0195
##      3        0.9707             nan     0.1000    0.0163
##      4        0.9350             nan     0.1000    0.0179
##      5        0.9069             nan     0.1000    0.0142
##      6        0.8845             nan     0.1000    0.0111
##      7        0.8678             nan     0.1000    0.0085
##      8        0.8520             nan     0.1000    0.0078
##      9        0.8362             nan     0.1000    0.0076
##     10        0.8251             nan     0.1000    0.0054
##     20        0.7463             nan     0.1000    0.0025
##     40        0.6747             nan     0.1000    0.0009
##     60        0.6449             nan     0.1000    0.0003
##     80        0.6289             nan     0.1000    0.0003
##    100        0.6174             nan     0.1000    0.0003
##    120        0.6096             nan     0.1000    0.0000
##    140        0.6039             nan     0.1000    0.0001
##    150        0.6013             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0349             nan     0.1000    0.0302
##      2        0.9895             nan     0.1000    0.0226
##      3        0.9526             nan     0.1000    0.0180
##      4        0.9206             nan     0.1000    0.0162
##      5        0.8944             nan     0.1000    0.0132
##      6        0.8693             nan     0.1000    0.0127
##      7        0.8500             nan     0.1000    0.0097
##      8        0.8300             nan     0.1000    0.0098
##      9        0.8164             nan     0.1000    0.0066
##     10        0.8019             nan     0.1000    0.0074
##     20        0.7178             nan     0.1000    0.0025
##     40        0.6516             nan     0.1000    0.0011
##     60        0.6248             nan     0.1000    0.0003
##     80        0.6090             nan     0.1000    0.0001
##    100        0.6007             nan     0.1000    0.0001
##    120        0.5930             nan     0.1000    0.0001
##    140        0.5872             nan     0.1000    0.0000
##    150        0.5854             nan     0.1000   -0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0585             nan     0.1000    0.0185
##      2        1.0284             nan     0.1000    0.0149
##      3        1.0044             nan     0.1000    0.0121
##      4        0.9838             nan     0.1000    0.0100
##      5        0.9641             nan     0.1000    0.0097
##      6        0.9469             nan     0.1000    0.0084
##      7        0.9340             nan     0.1000    0.0065
##      8        0.9183             nan     0.1000    0.0077
##      9        0.9056             nan     0.1000    0.0063
##     10        0.8959             nan     0.1000    0.0050
##     20        0.8168             nan     0.1000    0.0023
##     40        0.7369             nan     0.1000    0.0011
##     60        0.6944             nan     0.1000    0.0011
##     80        0.6713             nan     0.1000    0.0003
##    100        0.6564             nan     0.1000    0.0005
##    120        0.6449             nan     0.1000    0.0002
##    140        0.6367             nan     0.1000    0.0002
##    150        0.6336             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0446             nan     0.1000    0.0258
##      2        1.0054             nan     0.1000    0.0199
##      3        0.9733             nan     0.1000    0.0157
##      4        0.9378             nan     0.1000    0.0182
##      5        0.9105             nan     0.1000    0.0135
##      6        0.8884             nan     0.1000    0.0112
##      7        0.8701             nan     0.1000    0.0093
##      8        0.8562             nan     0.1000    0.0069
##      9        0.8420             nan     0.1000    0.0071
##     10        0.8314             nan     0.1000    0.0053
##     20        0.7477             nan     0.1000    0.0037
##     40        0.6770             nan     0.1000    0.0010
##     60        0.6477             nan     0.1000    0.0004
##     80        0.6324             nan     0.1000    0.0002
##    100        0.6205             nan     0.1000    0.0003
##    120        0.6122             nan     0.1000    0.0001
##    140        0.6067             nan     0.1000    0.0001
##    150        0.6040             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0359             nan     0.1000    0.0295
##      2        0.9897             nan     0.1000    0.0225
##      3        0.9522             nan     0.1000    0.0184
##      4        0.9207             nan     0.1000    0.0159
##      5        0.8954             nan     0.1000    0.0128
##      6        0.8698             nan     0.1000    0.0126
##      7        0.8508             nan     0.1000    0.0095
##      8        0.8325             nan     0.1000    0.0090
##      9        0.8192             nan     0.1000    0.0066
##     10        0.8046             nan     0.1000    0.0071
##     20        0.7202             nan     0.1000    0.0027
##     40        0.6540             nan     0.1000    0.0009
##     60        0.6274             nan     0.1000    0.0007
##     80        0.6106             nan     0.1000    0.0006
##    100        0.6004             nan     0.1000    0.0001
##    120        0.5948             nan     0.1000    0.0002
##    140        0.5889             nan     0.1000    0.0000
##    150        0.5871             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0585             nan     0.1000    0.0186
##      2        1.0281             nan     0.1000    0.0150
##      3        1.0031             nan     0.1000    0.0122
##      4        0.9828             nan     0.1000    0.0101
##      5        0.9632             nan     0.1000    0.0097
##      6        0.9455             nan     0.1000    0.0087
##      7        0.9324             nan     0.1000    0.0065
##      8        0.9164             nan     0.1000    0.0077
##      9        0.9034             nan     0.1000    0.0065
##     10        0.8940             nan     0.1000    0.0047
##     20        0.8128             nan     0.1000    0.0023
##     40        0.7335             nan     0.1000    0.0013
##     60        0.6927             nan     0.1000    0.0007
##     80        0.6686             nan     0.1000    0.0006
##    100        0.6529             nan     0.1000    0.0004
##    120        0.6424             nan     0.1000    0.0002
##    140        0.6347             nan     0.1000    0.0003
##    150        0.6319             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0423             nan     0.1000    0.0261
##      2        1.0025             nan     0.1000    0.0200
##      3        0.9693             nan     0.1000    0.0165
##      4        0.9426             nan     0.1000    0.0131
##      5        0.9136             nan     0.1000    0.0145
##      6        0.8932             nan     0.1000    0.0097
##      7        0.8724             nan     0.1000    0.0105
##      8        0.8561             nan     0.1000    0.0081
##      9        0.8425             nan     0.1000    0.0067
##     10        0.8292             nan     0.1000    0.0066
##     20        0.7465             nan     0.1000    0.0028
##     40        0.6750             nan     0.1000    0.0012
##     60        0.6458             nan     0.1000    0.0004
##     80        0.6298             nan     0.1000    0.0004
##    100        0.6182             nan     0.1000    0.0001
##    120        0.6098             nan     0.1000    0.0001
##    140        0.6043             nan     0.1000    0.0001
##    150        0.6015             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0353             nan     0.1000    0.0298
##      2        0.9874             nan     0.1000    0.0236
##      3        0.9512             nan     0.1000    0.0184
##      4        0.9188             nan     0.1000    0.0161
##      5        0.8936             nan     0.1000    0.0126
##      6        0.8708             nan     0.1000    0.0115
##      7        0.8492             nan     0.1000    0.0109
##      8        0.8310             nan     0.1000    0.0091
##      9        0.8173             nan     0.1000    0.0064
##     10        0.8047             nan     0.1000    0.0059
##     20        0.7172             nan     0.1000    0.0028
##     40        0.6512             nan     0.1000    0.0008
##     60        0.6241             nan     0.1000    0.0003
##     80        0.6086             nan     0.1000    0.0002
##    100        0.5981             nan     0.1000    0.0001
##    120        0.5908             nan     0.1000    0.0000
##    140        0.5852             nan     0.1000   -0.0000
##    150        0.5821             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0575             nan     0.1000    0.0187
##      2        1.0272             nan     0.1000    0.0150
##      3        1.0024             nan     0.1000    0.0122
##      4        0.9822             nan     0.1000    0.0105
##      5        0.9626             nan     0.1000    0.0098
##      6        0.9460             nan     0.1000    0.0086
##      7        0.9326             nan     0.1000    0.0065
##      8        0.9172             nan     0.1000    0.0078
##      9        0.9071             nan     0.1000    0.0049
##     10        0.8950             nan     0.1000    0.0063
##     20        0.8139             nan     0.1000    0.0035
##     40        0.7337             nan     0.1000    0.0013
##     60        0.6914             nan     0.1000    0.0005
##     80        0.6682             nan     0.1000    0.0003
##    100        0.6524             nan     0.1000    0.0003
##    120        0.6411             nan     0.1000    0.0001
##    140        0.6333             nan     0.1000    0.0000
##    150        0.6299             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0430             nan     0.1000    0.0259
##      2        1.0030             nan     0.1000    0.0199
##      3        0.9696             nan     0.1000    0.0169
##      4        0.9430             nan     0.1000    0.0135
##      5        0.9136             nan     0.1000    0.0147
##      6        0.8898             nan     0.1000    0.0117
##      7        0.8736             nan     0.1000    0.0078
##      8        0.8557             nan     0.1000    0.0091
##      9        0.8404             nan     0.1000    0.0077
##     10        0.8298             nan     0.1000    0.0054
##     20        0.7470             nan     0.1000    0.0028
##     40        0.6753             nan     0.1000    0.0013
##     60        0.6450             nan     0.1000    0.0003
##     80        0.6280             nan     0.1000    0.0002
##    100        0.6180             nan     0.1000    0.0001
##    120        0.6085             nan     0.1000    0.0001
##    140        0.6018             nan     0.1000    0.0000
##    150        0.5996             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0352             nan     0.1000    0.0296
##      2        0.9878             nan     0.1000    0.0240
##      3        0.9509             nan     0.1000    0.0184
##      4        0.9194             nan     0.1000    0.0158
##      5        0.8918             nan     0.1000    0.0134
##      6        0.8694             nan     0.1000    0.0109
##      7        0.8477             nan     0.1000    0.0110
##      8        0.8294             nan     0.1000    0.0091
##      9        0.8138             nan     0.1000    0.0079
##     10        0.8018             nan     0.1000    0.0057
##     20        0.7190             nan     0.1000    0.0028
##     40        0.6512             nan     0.1000    0.0012
##     60        0.6243             nan     0.1000    0.0005
##     80        0.6076             nan     0.1000    0.0004
##    100        0.5968             nan     0.1000    0.0001
##    120        0.5889             nan     0.1000    0.0001
##    140        0.5834             nan     0.1000    0.0002
##    150        0.5817             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0186
##      2        1.0282             nan     0.1000    0.0150
##      3        1.0040             nan     0.1000    0.0122
##      4        0.9841             nan     0.1000    0.0103
##      5        0.9644             nan     0.1000    0.0098
##      6        0.9474             nan     0.1000    0.0083
##      7        0.9342             nan     0.1000    0.0064
##      8        0.9184             nan     0.1000    0.0078
##      9        0.9084             nan     0.1000    0.0048
##     10        0.8962             nan     0.1000    0.0061
##     20        0.8163             nan     0.1000    0.0021
##     40        0.7363             nan     0.1000    0.0010
##     60        0.6969             nan     0.1000    0.0007
##     80        0.6712             nan     0.1000    0.0003
##    100        0.6562             nan     0.1000    0.0002
##    120        0.6452             nan     0.1000    0.0002
##    140        0.6368             nan     0.1000    0.0002
##    150        0.6339             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0438             nan     0.1000    0.0257
##      2        1.0041             nan     0.1000    0.0199
##      3        0.9707             nan     0.1000    0.0170
##      4        0.9429             nan     0.1000    0.0135
##      5        0.9144             nan     0.1000    0.0142
##      6        0.8909             nan     0.1000    0.0118
##      7        0.8714             nan     0.1000    0.0097
##      8        0.8559             nan     0.1000    0.0079
##      9        0.8424             nan     0.1000    0.0066
##     10        0.8310             nan     0.1000    0.0058
##     20        0.7501             nan     0.1000    0.0026
##     40        0.6785             nan     0.1000    0.0012
##     60        0.6480             nan     0.1000    0.0008
##     80        0.6321             nan     0.1000    0.0003
##    100        0.6207             nan     0.1000    0.0001
##    120        0.6116             nan     0.1000    0.0002
##    140        0.6055             nan     0.1000    0.0001
##    150        0.6028             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0369             nan     0.1000    0.0297
##      2        0.9907             nan     0.1000    0.0226
##      3        0.9542             nan     0.1000    0.0182
##      4        0.9221             nan     0.1000    0.0163
##      5        0.8961             nan     0.1000    0.0130
##      6        0.8735             nan     0.1000    0.0113
##      7        0.8526             nan     0.1000    0.0106
##      8        0.8377             nan     0.1000    0.0073
##      9        0.8206             nan     0.1000    0.0084
##     10        0.8074             nan     0.1000    0.0064
##     20        0.7236             nan     0.1000    0.0031
##     40        0.6547             nan     0.1000    0.0014
##     60        0.6273             nan     0.1000    0.0007
##     80        0.6112             nan     0.1000    0.0001
##    100        0.6005             nan     0.1000    0.0002
##    120        0.5920             nan     0.1000    0.0001
##    140        0.5869             nan     0.1000    0.0000
##    150        0.5849             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0586             nan     0.1000    0.0186
##      2        1.0287             nan     0.1000    0.0150
##      3        1.0041             nan     0.1000    0.0122
##      4        0.9834             nan     0.1000    0.0108
##      5        0.9636             nan     0.1000    0.0097
##      6        0.9494             nan     0.1000    0.0073
##      7        0.9331             nan     0.1000    0.0083
##      8        0.9181             nan     0.1000    0.0077
##      9        0.9080             nan     0.1000    0.0048
##     10        0.8958             nan     0.1000    0.0061
##     20        0.8157             nan     0.1000    0.0035
##     40        0.7360             nan     0.1000    0.0012
##     60        0.6960             nan     0.1000    0.0004
##     80        0.6705             nan     0.1000    0.0005
##    100        0.6541             nan     0.1000    0.0004
##    120        0.6441             nan     0.1000    0.0002
##    140        0.6366             nan     0.1000    0.0001
##    150        0.6339             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0443             nan     0.1000    0.0258
##      2        1.0042             nan     0.1000    0.0196
##      3        0.9711             nan     0.1000    0.0171
##      4        0.9450             nan     0.1000    0.0129
##      5        0.9211             nan     0.1000    0.0117
##      6        0.8969             nan     0.1000    0.0125
##      7        0.8779             nan     0.1000    0.0093
##      8        0.8594             nan     0.1000    0.0091
##      9        0.8441             nan     0.1000    0.0079
##     10        0.8328             nan     0.1000    0.0056
##     20        0.7503             nan     0.1000    0.0033
##     40        0.6779             nan     0.1000    0.0015
##     60        0.6474             nan     0.1000    0.0004
##     80        0.6305             nan     0.1000    0.0003
##    100        0.6201             nan     0.1000    0.0001
##    120        0.6111             nan     0.1000    0.0003
##    140        0.6043             nan     0.1000    0.0000
##    150        0.6017             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0369             nan     0.1000    0.0294
##      2        0.9899             nan     0.1000    0.0237
##      3        0.9518             nan     0.1000    0.0188
##      4        0.9206             nan     0.1000    0.0156
##      5        0.8949             nan     0.1000    0.0127
##      6        0.8723             nan     0.1000    0.0111
##      7        0.8503             nan     0.1000    0.0109
##      8        0.8325             nan     0.1000    0.0089
##      9        0.8171             nan     0.1000    0.0077
##     10        0.8031             nan     0.1000    0.0070
##     20        0.7235             nan     0.1000    0.0023
##     40        0.6550             nan     0.1000    0.0007
##     60        0.6273             nan     0.1000    0.0003
##     80        0.6102             nan     0.1000    0.0001
##    100        0.5990             nan     0.1000    0.0001
##    120        0.5915             nan     0.1000   -0.0000
##    140        0.5858             nan     0.1000    0.0000
##    150        0.5830             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0582             nan     0.1000    0.0187
##      2        1.0284             nan     0.1000    0.0151
##      3        1.0039             nan     0.1000    0.0123
##      4        0.9837             nan     0.1000    0.0098
##      5        0.9638             nan     0.1000    0.0098
##      6        0.9498             nan     0.1000    0.0069
##      7        0.9337             nan     0.1000    0.0083
##      8        0.9182             nan     0.1000    0.0078
##      9        0.9053             nan     0.1000    0.0065
##     10        0.8959             nan     0.1000    0.0047
##     20        0.8123             nan     0.1000    0.0033
##     40        0.7337             nan     0.1000    0.0011
##     60        0.6934             nan     0.1000    0.0005
##     80        0.6691             nan     0.1000    0.0003
##    100        0.6528             nan     0.1000    0.0004
##    120        0.6416             nan     0.1000    0.0004
##    140        0.6335             nan     0.1000    0.0001
##    150        0.6304             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0435             nan     0.1000    0.0261
##      2        1.0035             nan     0.1000    0.0197
##      3        0.9701             nan     0.1000    0.0166
##      4        0.9427             nan     0.1000    0.0136
##      5        0.9131             nan     0.1000    0.0145
##      6        0.8897             nan     0.1000    0.0117
##      7        0.8734             nan     0.1000    0.0080
##      8        0.8580             nan     0.1000    0.0076
##      9        0.8417             nan     0.1000    0.0082
##     10        0.8302             nan     0.1000    0.0055
##     20        0.7468             nan     0.1000    0.0033
##     40        0.6754             nan     0.1000    0.0011
##     60        0.6448             nan     0.1000    0.0004
##     80        0.6291             nan     0.1000    0.0001
##    100        0.6183             nan     0.1000    0.0004
##    120        0.6104             nan     0.1000    0.0001
##    140        0.6036             nan     0.1000    0.0001
##    150        0.6006             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0357             nan     0.1000    0.0296
##      2        0.9880             nan     0.1000    0.0236
##      3        0.9496             nan     0.1000    0.0187
##      4        0.9196             nan     0.1000    0.0149
##      5        0.8900             nan     0.1000    0.0145
##      6        0.8687             nan     0.1000    0.0109
##      7        0.8509             nan     0.1000    0.0087
##      8        0.8320             nan     0.1000    0.0095
##      9        0.8162             nan     0.1000    0.0077
##     10        0.8012             nan     0.1000    0.0072
##     20        0.7207             nan     0.1000    0.0023
##     40        0.6512             nan     0.1000    0.0009
##     60        0.6241             nan     0.1000    0.0004
##     80        0.6084             nan     0.1000    0.0003
##    100        0.5982             nan     0.1000    0.0000
##    120        0.5908             nan     0.1000    0.0000
##    140        0.5848             nan     0.1000    0.0000
##    150        0.5826             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0574             nan     0.1000    0.0188
##      2        1.0276             nan     0.1000    0.0151
##      3        1.0030             nan     0.1000    0.0123
##      4        0.9830             nan     0.1000    0.0103
##      5        0.9633             nan     0.1000    0.0099
##      6        0.9457             nan     0.1000    0.0086
##      7        0.9294             nan     0.1000    0.0080
##      8        0.9166             nan     0.1000    0.0067
##      9        0.9067             nan     0.1000    0.0048
##     10        0.8939             nan     0.1000    0.0063
##     20        0.8124             nan     0.1000    0.0033
##     40        0.7343             nan     0.1000    0.0009
##     60        0.6920             nan     0.1000    0.0010
##     80        0.6676             nan     0.1000    0.0005
##    100        0.6515             nan     0.1000    0.0001
##    120        0.6409             nan     0.1000    0.0001
##    140        0.6343             nan     0.1000    0.0001
##    150        0.6309             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0431             nan     0.1000    0.0264
##      2        1.0034             nan     0.1000    0.0201
##      3        0.9701             nan     0.1000    0.0165
##      4        0.9434             nan     0.1000    0.0135
##      5        0.9142             nan     0.1000    0.0145
##      6        0.8907             nan     0.1000    0.0116
##      7        0.8747             nan     0.1000    0.0080
##      8        0.8568             nan     0.1000    0.0090
##      9        0.8423             nan     0.1000    0.0071
##     10        0.8284             nan     0.1000    0.0068
##     20        0.7486             nan     0.1000    0.0029
##     40        0.6760             nan     0.1000    0.0007
##     60        0.6452             nan     0.1000    0.0006
##     80        0.6282             nan     0.1000    0.0002
##    100        0.6176             nan     0.1000    0.0002
##    120        0.6083             nan     0.1000    0.0001
##    140        0.6019             nan     0.1000    0.0002
##    150        0.5993             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0361             nan     0.1000    0.0300
##      2        0.9890             nan     0.1000    0.0234
##      3        0.9509             nan     0.1000    0.0192
##      4        0.9192             nan     0.1000    0.0156
##      5        0.8924             nan     0.1000    0.0134
##      6        0.8673             nan     0.1000    0.0124
##      7        0.8462             nan     0.1000    0.0105
##      8        0.8284             nan     0.1000    0.0087
##      9        0.8158             nan     0.1000    0.0062
##     10        0.8015             nan     0.1000    0.0071
##     20        0.7204             nan     0.1000    0.0023
##     40        0.6528             nan     0.1000    0.0014
##     60        0.6258             nan     0.1000    0.0006
##     80        0.6083             nan     0.1000    0.0004
##    100        0.5978             nan     0.1000    0.0001
##    120        0.5906             nan     0.1000    0.0000
##    140        0.5846             nan     0.1000    0.0001
##    150        0.5824             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0572             nan     0.1000    0.0186
##      2        1.0275             nan     0.1000    0.0150
##      3        1.0031             nan     0.1000    0.0122
##      4        0.9827             nan     0.1000    0.0104
##      5        0.9629             nan     0.1000    0.0098
##      6        0.9463             nan     0.1000    0.0083
##      7        0.9334             nan     0.1000    0.0066
##      8        0.9176             nan     0.1000    0.0078
##      9        0.9049             nan     0.1000    0.0064
##     10        0.8952             nan     0.1000    0.0048
##     20        0.8142             nan     0.1000    0.0023
##     40        0.7339             nan     0.1000    0.0011
##     60        0.6939             nan     0.1000    0.0008
##     80        0.6688             nan     0.1000    0.0004
##    100        0.6532             nan     0.1000    0.0003
##    120        0.6423             nan     0.1000    0.0002
##    140        0.6339             nan     0.1000    0.0002
##    150        0.6307             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0438             nan     0.1000    0.0260
##      2        1.0035             nan     0.1000    0.0200
##      3        0.9718             nan     0.1000    0.0158
##      4        0.9361             nan     0.1000    0.0179
##      5        0.9138             nan     0.1000    0.0108
##      6        0.8888             nan     0.1000    0.0124
##      7        0.8687             nan     0.1000    0.0102
##      8        0.8533             nan     0.1000    0.0080
##      9        0.8379             nan     0.1000    0.0076
##     10        0.8271             nan     0.1000    0.0053
##     20        0.7450             nan     0.1000    0.0027
##     40        0.6754             nan     0.1000    0.0012
##     60        0.6454             nan     0.1000    0.0006
##     80        0.6296             nan     0.1000    0.0004
##    100        0.6184             nan     0.1000    0.0003
##    120        0.6104             nan     0.1000    0.0002
##    140        0.6037             nan     0.1000    0.0001
##    150        0.6008             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0355             nan     0.1000    0.0296
##      2        0.9891             nan     0.1000    0.0231
##      3        0.9519             nan     0.1000    0.0185
##      4        0.9195             nan     0.1000    0.0161
##      5        0.8945             nan     0.1000    0.0125
##      6        0.8685             nan     0.1000    0.0128
##      7        0.8497             nan     0.1000    0.0092
##      8        0.8311             nan     0.1000    0.0094
##      9        0.8154             nan     0.1000    0.0077
##     10        0.8038             nan     0.1000    0.0054
##     20        0.7178             nan     0.1000    0.0025
##     40        0.6533             nan     0.1000    0.0009
##     60        0.6251             nan     0.1000    0.0004
##     80        0.6081             nan     0.1000    0.0002
##    100        0.5981             nan     0.1000    0.0002
##    120        0.5905             nan     0.1000    0.0000
##    140        0.5843             nan     0.1000    0.0001
##    150        0.5818             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0584             nan     0.1000    0.0185
##      2        1.0287             nan     0.1000    0.0149
##      3        1.0044             nan     0.1000    0.0121
##      4        0.9847             nan     0.1000    0.0101
##      5        0.9648             nan     0.1000    0.0097
##      6        0.9511             nan     0.1000    0.0064
##      7        0.9345             nan     0.1000    0.0084
##      8        0.9190             nan     0.1000    0.0077
##      9        0.9057             nan     0.1000    0.0064
##     10        0.8960             nan     0.1000    0.0048
##     20        0.8141             nan     0.1000    0.0033
##     40        0.7376             nan     0.1000    0.0008
##     60        0.6960             nan     0.1000    0.0005
##     80        0.6717             nan     0.1000    0.0006
##    100        0.6565             nan     0.1000    0.0002
##    120        0.6454             nan     0.1000    0.0003
##    140        0.6382             nan     0.1000    0.0001
##    150        0.6344             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0437             nan     0.1000    0.0259
##      2        1.0052             nan     0.1000    0.0195
##      3        0.9719             nan     0.1000    0.0162
##      4        0.9444             nan     0.1000    0.0137
##      5        0.9153             nan     0.1000    0.0148
##      6        0.8919             nan     0.1000    0.0117
##      7        0.8742             nan     0.1000    0.0086
##      8        0.8592             nan     0.1000    0.0074
##      9        0.8433             nan     0.1000    0.0078
##     10        0.8323             nan     0.1000    0.0056
##     20        0.7503             nan     0.1000    0.0025
##     40        0.6799             nan     0.1000    0.0008
##     60        0.6488             nan     0.1000    0.0005
##     80        0.6319             nan     0.1000    0.0001
##    100        0.6208             nan     0.1000    0.0001
##    120        0.6118             nan     0.1000    0.0003
##    140        0.6059             nan     0.1000   -0.0000
##    150        0.6034             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0363             nan     0.1000    0.0298
##      2        0.9897             nan     0.1000    0.0234
##      3        0.9530             nan     0.1000    0.0186
##      4        0.9199             nan     0.1000    0.0161
##      5        0.8952             nan     0.1000    0.0124
##      6        0.8720             nan     0.1000    0.0115
##      7        0.8501             nan     0.1000    0.0109
##      8        0.8322             nan     0.1000    0.0088
##      9        0.8164             nan     0.1000    0.0078
##     10        0.8029             nan     0.1000    0.0068
##     20        0.7224             nan     0.1000    0.0029
##     40        0.6556             nan     0.1000    0.0007
##     60        0.6280             nan     0.1000    0.0004
##     80        0.6121             nan     0.1000    0.0001
##    100        0.6018             nan     0.1000    0.0000
##    120        0.5931             nan     0.1000    0.0001
##    140        0.5879             nan     0.1000   -0.0000
##    150        0.5861             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0184
##      2        1.0284             nan     0.1000    0.0148
##      3        1.0044             nan     0.1000    0.0120
##      4        0.9838             nan     0.1000    0.0106
##      5        0.9656             nan     0.1000    0.0087
##      6        0.9468             nan     0.1000    0.0095
##      7        0.9310             nan     0.1000    0.0078
##      8        0.9179             nan     0.1000    0.0065
##      9        0.9050             nan     0.1000    0.0063
##     10        0.8952             nan     0.1000    0.0046
##     20        0.8144             nan     0.1000    0.0025
##     40        0.7359             nan     0.1000    0.0012
##     60        0.6947             nan     0.1000    0.0009
##     80        0.6708             nan     0.1000    0.0005
##    100        0.6542             nan     0.1000    0.0004
##    120        0.6449             nan     0.1000    0.0001
##    140        0.6372             nan     0.1000    0.0001
##    150        0.6328             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0259
##      2        1.0040             nan     0.1000    0.0198
##      3        0.9711             nan     0.1000    0.0163
##      4        0.9384             nan     0.1000    0.0164
##      5        0.9154             nan     0.1000    0.0117
##      6        0.8918             nan     0.1000    0.0117
##      7        0.8724             nan     0.1000    0.0095
##      8        0.8574             nan     0.1000    0.0074
##      9        0.8454             nan     0.1000    0.0060
##     10        0.8326             nan     0.1000    0.0064
##     20        0.7499             nan     0.1000    0.0028
##     40        0.6778             nan     0.1000    0.0010
##     60        0.6477             nan     0.1000    0.0005
##     80        0.6313             nan     0.1000    0.0002
##    100        0.6204             nan     0.1000    0.0003
##    120        0.6120             nan     0.1000   -0.0000
##    140        0.6050             nan     0.1000    0.0000
##    150        0.6022             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0363             nan     0.1000    0.0297
##      2        0.9889             nan     0.1000    0.0239
##      3        0.9512             nan     0.1000    0.0189
##      4        0.9196             nan     0.1000    0.0158
##      5        0.8946             nan     0.1000    0.0127
##      6        0.8696             nan     0.1000    0.0124
##      7        0.8480             nan     0.1000    0.0107
##      8        0.8316             nan     0.1000    0.0080
##      9        0.8158             nan     0.1000    0.0077
##     10        0.8043             nan     0.1000    0.0059
##     20        0.7193             nan     0.1000    0.0025
##     40        0.6533             nan     0.1000    0.0010
##     60        0.6241             nan     0.1000    0.0003
##     80        0.6096             nan     0.1000    0.0000
##    100        0.5982             nan     0.1000    0.0000
##    120        0.5914             nan     0.1000    0.0002
##    140        0.5852             nan     0.1000    0.0000
##    150        0.5826             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0583             nan     0.1000    0.0188
##      2        1.0281             nan     0.1000    0.0151
##      3        1.0037             nan     0.1000    0.0123
##      4        0.9834             nan     0.1000    0.0106
##      5        0.9636             nan     0.1000    0.0098
##      6        0.9467             nan     0.1000    0.0084
##      7        0.9308             nan     0.1000    0.0080
##      8        0.9182             nan     0.1000    0.0065
##      9        0.9084             nan     0.1000    0.0049
##     10        0.8955             nan     0.1000    0.0061
##     20        0.8138             nan     0.1000    0.0033
##     40        0.7363             nan     0.1000    0.0014
##     60        0.6956             nan     0.1000    0.0006
##     80        0.6707             nan     0.1000    0.0006
##    100        0.6552             nan     0.1000    0.0001
##    120        0.6443             nan     0.1000    0.0001
##    140        0.6358             nan     0.1000    0.0003
##    150        0.6327             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0440             nan     0.1000    0.0258
##      2        1.0039             nan     0.1000    0.0197
##      3        0.9708             nan     0.1000    0.0165
##      4        0.9436             nan     0.1000    0.0137
##      5        0.9146             nan     0.1000    0.0150
##      6        0.8911             nan     0.1000    0.0116
##      7        0.8738             nan     0.1000    0.0088
##      8        0.8578             nan     0.1000    0.0076
##      9        0.8431             nan     0.1000    0.0074
##     10        0.8311             nan     0.1000    0.0061
##     20        0.7490             nan     0.1000    0.0028
##     40        0.6753             nan     0.1000    0.0010
##     60        0.6460             nan     0.1000    0.0006
##     80        0.6300             nan     0.1000    0.0005
##    100        0.6200             nan     0.1000    0.0002
##    120        0.6123             nan     0.1000    0.0001
##    140        0.6060             nan     0.1000    0.0000
##    150        0.6040             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0358             nan     0.1000    0.0296
##      2        0.9896             nan     0.1000    0.0231
##      3        0.9522             nan     0.1000    0.0185
##      4        0.9201             nan     0.1000    0.0156
##      5        0.8948             nan     0.1000    0.0122
##      6        0.8719             nan     0.1000    0.0114
##      7        0.8495             nan     0.1000    0.0110
##      8        0.8342             nan     0.1000    0.0076
##      9        0.8186             nan     0.1000    0.0077
##     10        0.8040             nan     0.1000    0.0072
##     20        0.7215             nan     0.1000    0.0028
##     40        0.6528             nan     0.1000    0.0010
##     60        0.6264             nan     0.1000    0.0005
##     80        0.6113             nan     0.1000    0.0002
##    100        0.6006             nan     0.1000    0.0000
##    120        0.5931             nan     0.1000    0.0001
##    140        0.5875             nan     0.1000    0.0001
##    150        0.5847             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0188
##      2        1.0279             nan     0.1000    0.0151
##      3        1.0034             nan     0.1000    0.0123
##      4        0.9833             nan     0.1000    0.0100
##      5        0.9639             nan     0.1000    0.0099
##      6        0.9502             nan     0.1000    0.0066
##      7        0.9342             nan     0.1000    0.0081
##      8        0.9181             nan     0.1000    0.0079
##      9        0.9079             nan     0.1000    0.0050
##     10        0.8957             nan     0.1000    0.0061
##     20        0.8142             nan     0.1000    0.0035
##     40        0.7354             nan     0.1000    0.0015
##     60        0.6951             nan     0.1000    0.0007
##     80        0.6706             nan     0.1000    0.0006
##    100        0.6558             nan     0.1000    0.0001
##    120        0.6449             nan     0.1000    0.0002
##    140        0.6370             nan     0.1000    0.0001
##    150        0.6332             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0432             nan     0.1000    0.0262
##      2        1.0034             nan     0.1000    0.0203
##      3        0.9696             nan     0.1000    0.0167
##      4        0.9425             nan     0.1000    0.0135
##      5        0.9129             nan     0.1000    0.0146
##      6        0.8895             nan     0.1000    0.0119
##      7        0.8698             nan     0.1000    0.0097
##      8        0.8554             nan     0.1000    0.0072
##      9        0.8420             nan     0.1000    0.0066
##     10        0.8305             nan     0.1000    0.0054
##     20        0.7470             nan     0.1000    0.0033
##     40        0.6784             nan     0.1000    0.0008
##     60        0.6480             nan     0.1000    0.0006
##     80        0.6325             nan     0.1000    0.0003
##    100        0.6210             nan     0.1000    0.0001
##    120        0.6125             nan     0.1000    0.0001
##    140        0.6050             nan     0.1000    0.0001
##    150        0.6024             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0364             nan     0.1000    0.0295
##      2        0.9885             nan     0.1000    0.0235
##      3        0.9515             nan     0.1000    0.0184
##      4        0.9198             nan     0.1000    0.0156
##      5        0.8932             nan     0.1000    0.0133
##      6        0.8708             nan     0.1000    0.0109
##      7        0.8487             nan     0.1000    0.0110
##      8        0.8309             nan     0.1000    0.0089
##      9        0.8148             nan     0.1000    0.0077
##     10        0.8006             nan     0.1000    0.0070
##     20        0.7194             nan     0.1000    0.0029
##     40        0.6530             nan     0.1000    0.0008
##     60        0.6261             nan     0.1000    0.0003
##     80        0.6102             nan     0.1000    0.0002
##    100        0.5983             nan     0.1000    0.0001
##    120        0.5909             nan     0.1000    0.0002
##    140        0.5858             nan     0.1000    0.0001
##    150        0.5838             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0587             nan     0.1000    0.0184
##      2        1.0290             nan     0.1000    0.0149
##      3        1.0071             nan     0.1000    0.0107
##      4        0.9834             nan     0.1000    0.0118
##      5        0.9639             nan     0.1000    0.0097
##      6        0.9462             nan     0.1000    0.0085
##      7        0.9330             nan     0.1000    0.0063
##      8        0.9175             nan     0.1000    0.0077
##      9        0.9048             nan     0.1000    0.0064
##     10        0.8952             nan     0.1000    0.0047
##     20        0.8129             nan     0.1000    0.0032
##     40        0.7380             nan     0.1000    0.0011
##     60        0.6947             nan     0.1000    0.0007
##     80        0.6707             nan     0.1000    0.0003
##    100        0.6557             nan     0.1000    0.0002
##    120        0.6441             nan     0.1000    0.0001
##    140        0.6367             nan     0.1000    0.0001
##    150        0.6336             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0255
##      2        1.0047             nan     0.1000    0.0197
##      3        0.9717             nan     0.1000    0.0163
##      4        0.9457             nan     0.1000    0.0132
##      5        0.9168             nan     0.1000    0.0148
##      6        0.8932             nan     0.1000    0.0115
##      7        0.8741             nan     0.1000    0.0097
##      8        0.8597             nan     0.1000    0.0073
##      9        0.8445             nan     0.1000    0.0075
##     10        0.8336             nan     0.1000    0.0055
##     20        0.7530             nan     0.1000    0.0025
##     40        0.6792             nan     0.1000    0.0011
##     60        0.6486             nan     0.1000    0.0003
##     80        0.6326             nan     0.1000    0.0003
##    100        0.6219             nan     0.1000    0.0001
##    120        0.6130             nan     0.1000    0.0003
##    140        0.6068             nan     0.1000    0.0000
##    150        0.6042             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0365             nan     0.1000    0.0289
##      2        0.9893             nan     0.1000    0.0234
##      3        0.9526             nan     0.1000    0.0182
##      4        0.9207             nan     0.1000    0.0157
##      5        0.8957             nan     0.1000    0.0125
##      6        0.8730             nan     0.1000    0.0115
##      7        0.8517             nan     0.1000    0.0106
##      8        0.8335             nan     0.1000    0.0091
##      9        0.8206             nan     0.1000    0.0062
##     10        0.8062             nan     0.1000    0.0073
##     20        0.7205             nan     0.1000    0.0031
##     40        0.6538             nan     0.1000    0.0010
##     60        0.6262             nan     0.1000    0.0004
##     80        0.6103             nan     0.1000    0.0002
##    100        0.6012             nan     0.1000    0.0002
##    120        0.5944             nan     0.1000    0.0000
##    140        0.5887             nan     0.1000   -0.0001
##    150        0.5867             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0586             nan     0.1000    0.0186
##      2        1.0285             nan     0.1000    0.0149
##      3        1.0042             nan     0.1000    0.0122
##      4        0.9841             nan     0.1000    0.0103
##      5        0.9649             nan     0.1000    0.0097
##      6        0.9481             nan     0.1000    0.0085
##      7        0.9350             nan     0.1000    0.0066
##      8        0.9197             nan     0.1000    0.0078
##      9        0.9096             nan     0.1000    0.0051
##     10        0.8968             nan     0.1000    0.0062
##     20        0.8160             nan     0.1000    0.0022
##     40        0.7366             nan     0.1000    0.0017
##     60        0.6954             nan     0.1000    0.0005
##     80        0.6704             nan     0.1000    0.0004
##    100        0.6561             nan     0.1000    0.0001
##    120        0.6443             nan     0.1000    0.0001
##    140        0.6369             nan     0.1000    0.0002
##    150        0.6338             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0431             nan     0.1000    0.0261
##      2        1.0040             nan     0.1000    0.0196
##      3        0.9708             nan     0.1000    0.0167
##      4        0.9383             nan     0.1000    0.0162
##      5        0.9146             nan     0.1000    0.0118
##      6        0.8912             nan     0.1000    0.0117
##      7        0.8716             nan     0.1000    0.0094
##      8        0.8574             nan     0.1000    0.0072
##      9        0.8456             nan     0.1000    0.0060
##     10        0.8325             nan     0.1000    0.0064
##     20        0.7490             nan     0.1000    0.0029
##     40        0.6783             nan     0.1000    0.0010
##     60        0.6478             nan     0.1000    0.0003
##     80        0.6314             nan     0.1000    0.0002
##    100        0.6203             nan     0.1000    0.0001
##    120        0.6121             nan     0.1000    0.0002
##    140        0.6067             nan     0.1000    0.0000
##    150        0.6043             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0372             nan     0.1000    0.0296
##      2        0.9901             nan     0.1000    0.0232
##      3        0.9525             nan     0.1000    0.0187
##      4        0.9203             nan     0.1000    0.0162
##      5        0.8944             nan     0.1000    0.0131
##      6        0.8728             nan     0.1000    0.0106
##      7        0.8516             nan     0.1000    0.0105
##      8        0.8329             nan     0.1000    0.0096
##      9        0.8171             nan     0.1000    0.0078
##     10        0.8041             nan     0.1000    0.0064
##     20        0.7197             nan     0.1000    0.0027
##     40        0.6545             nan     0.1000    0.0011
##     60        0.6279             nan     0.1000    0.0005
##     80        0.6134             nan     0.1000    0.0002
##    100        0.6028             nan     0.1000    0.0001
##    120        0.5956             nan     0.1000    0.0000
##    140        0.5900             nan     0.1000   -0.0000
##    150        0.5880             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0583             nan     0.1000    0.0187
##      2        1.0283             nan     0.1000    0.0151
##      3        1.0040             nan     0.1000    0.0123
##      4        0.9833             nan     0.1000    0.0104
##      5        0.9633             nan     0.1000    0.0098
##      6        0.9459             nan     0.1000    0.0086
##      7        0.9326             nan     0.1000    0.0067
##      8        0.9173             nan     0.1000    0.0078
##      9        0.9070             nan     0.1000    0.0051
##     10        0.8947             nan     0.1000    0.0062
##     20        0.8133             nan     0.1000    0.0035
##     40        0.7342             nan     0.1000    0.0016
##     60        0.6933             nan     0.1000    0.0010
##     80        0.6695             nan     0.1000    0.0004
##    100        0.6545             nan     0.1000    0.0001
##    120        0.6437             nan     0.1000    0.0002
##    140        0.6354             nan     0.1000    0.0002
##    150        0.6325             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0439             nan     0.1000    0.0262
##      2        1.0042             nan     0.1000    0.0202
##      3        0.9703             nan     0.1000    0.0168
##      4        0.9436             nan     0.1000    0.0133
##      5        0.9142             nan     0.1000    0.0147
##      6        0.8908             nan     0.1000    0.0118
##      7        0.8712             nan     0.1000    0.0098
##      8        0.8553             nan     0.1000    0.0079
##      9        0.8414             nan     0.1000    0.0070
##     10        0.8302             nan     0.1000    0.0056
##     20        0.7469             nan     0.1000    0.0028
##     40        0.6757             nan     0.1000    0.0012
##     60        0.6457             nan     0.1000    0.0003
##     80        0.6302             nan     0.1000    0.0001
##    100        0.6188             nan     0.1000    0.0001
##    120        0.6108             nan     0.1000    0.0000
##    140        0.6046             nan     0.1000    0.0001
##    150        0.6019             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0364             nan     0.1000    0.0298
##      2        0.9891             nan     0.1000    0.0235
##      3        0.9524             nan     0.1000    0.0184
##      4        0.9210             nan     0.1000    0.0158
##      5        0.8941             nan     0.1000    0.0135
##      6        0.8717             nan     0.1000    0.0111
##      7        0.8505             nan     0.1000    0.0109
##      8        0.8316             nan     0.1000    0.0094
##      9        0.8149             nan     0.1000    0.0082
##     10        0.8036             nan     0.1000    0.0052
##     20        0.7208             nan     0.1000    0.0029
##     40        0.6531             nan     0.1000    0.0008
##     60        0.6252             nan     0.1000    0.0003
##     80        0.6084             nan     0.1000    0.0002
##    100        0.5987             nan     0.1000    0.0000
##    120        0.5916             nan     0.1000    0.0001
##    140        0.5872             nan     0.1000    0.0001
##    150        0.5848             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0185
##      2        1.0288             nan     0.1000    0.0150
##      3        1.0042             nan     0.1000    0.0122
##      4        0.9838             nan     0.1000    0.0099
##      5        0.9646             nan     0.1000    0.0090
##      6        0.9503             nan     0.1000    0.0072
##      7        0.9337             nan     0.1000    0.0083
##      8        0.9201             nan     0.1000    0.0064
##      9        0.9048             nan     0.1000    0.0076
##     10        0.8919             nan     0.1000    0.0063
##     20        0.8142             nan     0.1000    0.0034
##     40        0.7351             nan     0.1000    0.0013
##     60        0.6948             nan     0.1000    0.0010
##     80        0.6708             nan     0.1000    0.0005
##    100        0.6549             nan     0.1000    0.0005
##    120        0.6443             nan     0.1000    0.0001
##    140        0.6368             nan     0.1000   -0.0000
##    150        0.6338             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0435             nan     0.1000    0.0258
##      2        1.0041             nan     0.1000    0.0195
##      3        0.9712             nan     0.1000    0.0165
##      4        0.9433             nan     0.1000    0.0135
##      5        0.9149             nan     0.1000    0.0143
##      6        0.8914             nan     0.1000    0.0119
##      7        0.8750             nan     0.1000    0.0080
##      8        0.8604             nan     0.1000    0.0073
##      9        0.8438             nan     0.1000    0.0083
##     10        0.8322             nan     0.1000    0.0056
##     20        0.7523             nan     0.1000    0.0026
##     40        0.6788             nan     0.1000    0.0008
##     60        0.6494             nan     0.1000    0.0004
##     80        0.6336             nan     0.1000    0.0003
##    100        0.6214             nan     0.1000    0.0004
##    120        0.6137             nan     0.1000    0.0000
##    140        0.6064             nan     0.1000    0.0000
##    150        0.6035             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0356             nan     0.1000    0.0296
##      2        0.9893             nan     0.1000    0.0231
##      3        0.9518             nan     0.1000    0.0188
##      4        0.9205             nan     0.1000    0.0155
##      5        0.8952             nan     0.1000    0.0124
##      6        0.8726             nan     0.1000    0.0108
##      7        0.8540             nan     0.1000    0.0094
##      8        0.8349             nan     0.1000    0.0092
##      9        0.8208             nan     0.1000    0.0066
##     10        0.8069             nan     0.1000    0.0067
##     20        0.7217             nan     0.1000    0.0029
##     40        0.6548             nan     0.1000    0.0009
##     60        0.6284             nan     0.1000    0.0002
##     80        0.6129             nan     0.1000    0.0001
##    100        0.6023             nan     0.1000    0.0001
##    120        0.5958             nan     0.1000    0.0002
##    140        0.5887             nan     0.1000    0.0003
##    150        0.5852             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0584             nan     0.1000    0.0187
##      2        1.0284             nan     0.1000    0.0151
##      3        1.0035             nan     0.1000    0.0123
##      4        0.9832             nan     0.1000    0.0105
##      5        0.9638             nan     0.1000    0.0098
##      6        0.9502             nan     0.1000    0.0068
##      7        0.9341             nan     0.1000    0.0082
##      8        0.9179             nan     0.1000    0.0078
##      9        0.9052             nan     0.1000    0.0063
##     10        0.8923             nan     0.1000    0.0063
##     20        0.8140             nan     0.1000    0.0023
##     40        0.7343             nan     0.1000    0.0012
##     60        0.6939             nan     0.1000    0.0009
##     80        0.6692             nan     0.1000    0.0004
##    100        0.6531             nan     0.1000    0.0003
##    120        0.6429             nan     0.1000    0.0001
##    140        0.6348             nan     0.1000    0.0003
##    150        0.6316             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0432             nan     0.1000    0.0258
##      2        1.0032             nan     0.1000    0.0197
##      3        0.9699             nan     0.1000    0.0169
##      4        0.9440             nan     0.1000    0.0131
##      5        0.9154             nan     0.1000    0.0147
##      6        0.8919             nan     0.1000    0.0119
##      7        0.8722             nan     0.1000    0.0100
##      8        0.8572             nan     0.1000    0.0078
##      9        0.8430             nan     0.1000    0.0071
##     10        0.8322             nan     0.1000    0.0054
##     20        0.7492             nan     0.1000    0.0026
##     40        0.6761             nan     0.1000    0.0014
##     60        0.6455             nan     0.1000    0.0005
##     80        0.6299             nan     0.1000    0.0001
##    100        0.6185             nan     0.1000    0.0001
##    120        0.6101             nan     0.1000    0.0001
##    140        0.6028             nan     0.1000    0.0000
##    150        0.6004             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0361             nan     0.1000    0.0297
##      2        0.9902             nan     0.1000    0.0233
##      3        0.9533             nan     0.1000    0.0183
##      4        0.9218             nan     0.1000    0.0157
##      5        0.8920             nan     0.1000    0.0151
##      6        0.8693             nan     0.1000    0.0112
##      7        0.8481             nan     0.1000    0.0103
##      8        0.8319             nan     0.1000    0.0078
##      9        0.8160             nan     0.1000    0.0078
##     10        0.8013             nan     0.1000    0.0074
##     20        0.7187             nan     0.1000    0.0030
##     40        0.6517             nan     0.1000    0.0006
##     60        0.6255             nan     0.1000    0.0005
##     80        0.6093             nan     0.1000    0.0001
##    100        0.6001             nan     0.1000    0.0001
##    120        0.5919             nan     0.1000    0.0000
##    140        0.5856             nan     0.1000    0.0000
##    150        0.5832             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0575             nan     0.1000    0.0186
##      2        1.0270             nan     0.1000    0.0150
##      3        1.0028             nan     0.1000    0.0122
##      4        0.9829             nan     0.1000    0.0097
##      5        0.9679             nan     0.1000    0.0070
##      6        0.9490             nan     0.1000    0.0096
##      7        0.9323             nan     0.1000    0.0081
##      8        0.9165             nan     0.1000    0.0078
##      9        0.9041             nan     0.1000    0.0061
##     10        0.8943             nan     0.1000    0.0048
##     20        0.8141             nan     0.1000    0.0023
##     40        0.7365             nan     0.1000    0.0016
##     60        0.6952             nan     0.1000    0.0008
##     80        0.6709             nan     0.1000    0.0003
##    100        0.6553             nan     0.1000    0.0003
##    120        0.6452             nan     0.1000    0.0001
##    140        0.6352             nan     0.1000    0.0001
##    150        0.6323             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0431             nan     0.1000    0.0259
##      2        1.0037             nan     0.1000    0.0196
##      3        0.9702             nan     0.1000    0.0169
##      4        0.9436             nan     0.1000    0.0134
##      5        0.9144             nan     0.1000    0.0146
##      6        0.8907             nan     0.1000    0.0116
##      7        0.8730             nan     0.1000    0.0086
##      8        0.8569             nan     0.1000    0.0080
##      9        0.8412             nan     0.1000    0.0081
##     10        0.8293             nan     0.1000    0.0060
##     20        0.7480             nan     0.1000    0.0029
##     40        0.6783             nan     0.1000    0.0010
##     60        0.6468             nan     0.1000    0.0004
##     80        0.6308             nan     0.1000    0.0004
##    100        0.6199             nan     0.1000    0.0002
##    120        0.6122             nan     0.1000    0.0000
##    140        0.6050             nan     0.1000    0.0001
##    150        0.6021             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0348             nan     0.1000    0.0297
##      2        0.9884             nan     0.1000    0.0235
##      3        0.9515             nan     0.1000    0.0187
##      4        0.9189             nan     0.1000    0.0163
##      5        0.8931             nan     0.1000    0.0126
##      6        0.8707             nan     0.1000    0.0110
##      7        0.8521             nan     0.1000    0.0095
##      8        0.8333             nan     0.1000    0.0094
##      9        0.8203             nan     0.1000    0.0064
##     10        0.8069             nan     0.1000    0.0066
##     20        0.7256             nan     0.1000    0.0024
##     40        0.6563             nan     0.1000    0.0010
##     60        0.6292             nan     0.1000    0.0005
##     80        0.6112             nan     0.1000    0.0001
##    100        0.6005             nan     0.1000   -0.0000
##    120        0.5940             nan     0.1000    0.0000
##    140        0.5870             nan     0.1000    0.0002
##    150        0.5850             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0187
##      2        1.0281             nan     0.1000    0.0151
##      3        1.0032             nan     0.1000    0.0123
##      4        0.9827             nan     0.1000    0.0105
##      5        0.9627             nan     0.1000    0.0098
##      6        0.9492             nan     0.1000    0.0061
##      7        0.9331             nan     0.1000    0.0079
##      8        0.9163             nan     0.1000    0.0081
##      9        0.9063             nan     0.1000    0.0045
##     10        0.8935             nan     0.1000    0.0062
##     20        0.8112             nan     0.1000    0.0034
##     40        0.7354             nan     0.1000    0.0012
##     60        0.6934             nan     0.1000    0.0008
##     80        0.6688             nan     0.1000    0.0002
##    100        0.6536             nan     0.1000    0.0004
##    120        0.6416             nan     0.1000    0.0002
##    140        0.6344             nan     0.1000    0.0002
##    150        0.6312             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0443             nan     0.1000    0.0260
##      2        1.0051             nan     0.1000    0.0198
##      3        0.9720             nan     0.1000    0.0166
##      4        0.9446             nan     0.1000    0.0138
##      5        0.9156             nan     0.1000    0.0147
##      6        0.8923             nan     0.1000    0.0119
##      7        0.8747             nan     0.1000    0.0086
##      8        0.8582             nan     0.1000    0.0080
##      9        0.8423             nan     0.1000    0.0081
##     10        0.8308             nan     0.1000    0.0056
##     20        0.7479             nan     0.1000    0.0034
##     40        0.6775             nan     0.1000    0.0009
##     60        0.6456             nan     0.1000    0.0004
##     80        0.6290             nan     0.1000    0.0002
##    100        0.6187             nan     0.1000    0.0002
##    120        0.6097             nan     0.1000    0.0001
##    140        0.6034             nan     0.1000    0.0001
##    150        0.6013             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0371             nan     0.1000    0.0296
##      2        0.9897             nan     0.1000    0.0239
##      3        0.9524             nan     0.1000    0.0189
##      4        0.9203             nan     0.1000    0.0163
##      5        0.8956             nan     0.1000    0.0122
##      6        0.8703             nan     0.1000    0.0127
##      7        0.8486             nan     0.1000    0.0107
##      8        0.8330             nan     0.1000    0.0077
##      9        0.8169             nan     0.1000    0.0078
##     10        0.8058             nan     0.1000    0.0053
##     20        0.7204             nan     0.1000    0.0025
##     40        0.6523             nan     0.1000    0.0011
##     60        0.6250             nan     0.1000    0.0006
##     80        0.6098             nan     0.1000    0.0002
##    100        0.5997             nan     0.1000    0.0001
##    120        0.5913             nan     0.1000    0.0000
##    140        0.5854             nan     0.1000    0.0001
##    150        0.5832             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0187
##      2        1.0271             nan     0.1000    0.0151
##      3        1.0026             nan     0.1000    0.0122
##      4        0.9822             nan     0.1000    0.0097
##      5        0.9626             nan     0.1000    0.0098
##      6        0.9452             nan     0.1000    0.0086
##      7        0.9320             nan     0.1000    0.0064
##      8        0.9166             nan     0.1000    0.0078
##      9        0.9039             nan     0.1000    0.0064
##     10        0.8943             nan     0.1000    0.0048
##     20        0.8134             nan     0.1000    0.0024
##     40        0.7339             nan     0.1000    0.0020
##     60        0.6935             nan     0.1000    0.0007
##     80        0.6676             nan     0.1000    0.0007
##    100        0.6526             nan     0.1000    0.0004
##    120        0.6420             nan     0.1000    0.0001
##    140        0.6354             nan     0.1000    0.0001
##    150        0.6320             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0434             nan     0.1000    0.0262
##      2        1.0038             nan     0.1000    0.0200
##      3        0.9699             nan     0.1000    0.0164
##      4        0.9434             nan     0.1000    0.0134
##      5        0.9139             nan     0.1000    0.0148
##      6        0.8900             nan     0.1000    0.0121
##      7        0.8704             nan     0.1000    0.0100
##      8        0.8561             nan     0.1000    0.0071
##      9        0.8410             nan     0.1000    0.0075
##     10        0.8299             nan     0.1000    0.0053
##     20        0.7477             nan     0.1000    0.0032
##     40        0.6767             nan     0.1000    0.0009
##     60        0.6464             nan     0.1000    0.0005
##     80        0.6295             nan     0.1000    0.0004
##    100        0.6189             nan     0.1000    0.0002
##    120        0.6102             nan     0.1000    0.0003
##    140        0.6042             nan     0.1000   -0.0000
##    150        0.6012             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0362             nan     0.1000    0.0296
##      2        0.9896             nan     0.1000    0.0235
##      3        0.9520             nan     0.1000    0.0188
##      4        0.9194             nan     0.1000    0.0160
##      5        0.8934             nan     0.1000    0.0130
##      6        0.8711             nan     0.1000    0.0111
##      7        0.8493             nan     0.1000    0.0110
##      8        0.8334             nan     0.1000    0.0078
##      9        0.8181             nan     0.1000    0.0076
##     10        0.8057             nan     0.1000    0.0061
##     20        0.7192             nan     0.1000    0.0026
##     40        0.6519             nan     0.1000    0.0007
##     60        0.6249             nan     0.1000    0.0005
##     80        0.6101             nan     0.1000    0.0003
##    100        0.6003             nan     0.1000    0.0001
##    120        0.5919             nan     0.1000    0.0000
##    140        0.5869             nan     0.1000   -0.0000
##    150        0.5839             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0578             nan     0.1000    0.0188
##      2        1.0275             nan     0.1000    0.0151
##      3        1.0035             nan     0.1000    0.0123
##      4        0.9831             nan     0.1000    0.0100
##      5        0.9632             nan     0.1000    0.0099
##      6        0.9460             nan     0.1000    0.0087
##      7        0.9328             nan     0.1000    0.0062
##      8        0.9171             nan     0.1000    0.0078
##      9        0.9044             nan     0.1000    0.0064
##     10        0.8946             nan     0.1000    0.0049
##     20        0.8158             nan     0.1000    0.0025
##     40        0.7345             nan     0.1000    0.0016
##     60        0.6924             nan     0.1000    0.0010
##     80        0.6671             nan     0.1000    0.0004
##    100        0.6524             nan     0.1000    0.0001
##    120        0.6415             nan     0.1000    0.0003
##    140        0.6331             nan     0.1000    0.0001
##    150        0.6305             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0428             nan     0.1000    0.0260
##      2        1.0028             nan     0.1000    0.0197
##      3        0.9699             nan     0.1000    0.0163
##      4        0.9428             nan     0.1000    0.0139
##      5        0.9138             nan     0.1000    0.0146
##      6        0.8900             nan     0.1000    0.0118
##      7        0.8702             nan     0.1000    0.0096
##      8        0.8553             nan     0.1000    0.0076
##      9        0.8434             nan     0.1000    0.0060
##     10        0.8304             nan     0.1000    0.0065
##     20        0.7491             nan     0.1000    0.0028
##     40        0.6737             nan     0.1000    0.0012
##     60        0.6443             nan     0.1000    0.0002
##     80        0.6278             nan     0.1000    0.0000
##    100        0.6170             nan     0.1000    0.0000
##    120        0.6089             nan     0.1000    0.0001
##    140        0.6029             nan     0.1000    0.0001
##    150        0.6003             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0363             nan     0.1000    0.0300
##      2        0.9893             nan     0.1000    0.0237
##      3        0.9510             nan     0.1000    0.0189
##      4        0.9194             nan     0.1000    0.0158
##      5        0.8938             nan     0.1000    0.0126
##      6        0.8713             nan     0.1000    0.0114
##      7        0.8491             nan     0.1000    0.0111
##      8        0.8309             nan     0.1000    0.0091
##      9        0.8176             nan     0.1000    0.0065
##     10        0.8046             nan     0.1000    0.0065
##     20        0.7191             nan     0.1000    0.0032
##     40        0.6516             nan     0.1000    0.0006
##     60        0.6252             nan     0.1000    0.0004
##     80        0.6083             nan     0.1000    0.0001
##    100        0.5981             nan     0.1000    0.0001
##    120        0.5903             nan     0.1000    0.0000
##    140        0.5839             nan     0.1000    0.0001
##    150        0.5809             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0184
##      2        1.0290             nan     0.1000    0.0148
##      3        1.0044             nan     0.1000    0.0121
##      4        0.9838             nan     0.1000    0.0098
##      5        0.9644             nan     0.1000    0.0097
##      6        0.9500             nan     0.1000    0.0070
##      7        0.9333             nan     0.1000    0.0083
##      8        0.9181             nan     0.1000    0.0077
##      9        0.9055             nan     0.1000    0.0065
##     10        0.8959             nan     0.1000    0.0049
##     20        0.8153             nan     0.1000    0.0024
##     40        0.7350             nan     0.1000    0.0015
##     60        0.6944             nan     0.1000    0.0005
##     80        0.6694             nan     0.1000    0.0006
##    100        0.6543             nan     0.1000    0.0002
##    120        0.6439             nan     0.1000    0.0002
##    140        0.6362             nan     0.1000    0.0001
##    150        0.6326             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0437             nan     0.1000    0.0257
##      2        1.0041             nan     0.1000    0.0198
##      3        0.9699             nan     0.1000    0.0162
##      4        0.9432             nan     0.1000    0.0131
##      5        0.9148             nan     0.1000    0.0144
##      6        0.8910             nan     0.1000    0.0116
##      7        0.8715             nan     0.1000    0.0095
##      8        0.8570             nan     0.1000    0.0072
##      9        0.8451             nan     0.1000    0.0059
##     10        0.8321             nan     0.1000    0.0066
##     20        0.7489             nan     0.1000    0.0027
##     40        0.6760             nan     0.1000    0.0008
##     60        0.6477             nan     0.1000    0.0005
##     80        0.6317             nan     0.1000    0.0004
##    100        0.6204             nan     0.1000    0.0002
##    120        0.6110             nan     0.1000    0.0002
##    140        0.6055             nan     0.1000    0.0002
##    150        0.6028             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0357             nan     0.1000    0.0293
##      2        0.9890             nan     0.1000    0.0234
##      3        0.9509             nan     0.1000    0.0191
##      4        0.9193             nan     0.1000    0.0156
##      5        0.8942             nan     0.1000    0.0124
##      6        0.8728             nan     0.1000    0.0103
##      7        0.8508             nan     0.1000    0.0112
##      8        0.8320             nan     0.1000    0.0093
##      9        0.8160             nan     0.1000    0.0081
##     10        0.8045             nan     0.1000    0.0057
##     20        0.7198             nan     0.1000    0.0026
##     40        0.6529             nan     0.1000    0.0007
##     60        0.6266             nan     0.1000    0.0004
##     80        0.6109             nan     0.1000    0.0001
##    100        0.6000             nan     0.1000    0.0001
##    120        0.5924             nan     0.1000   -0.0000
##    140        0.5871             nan     0.1000    0.0000
##    150        0.5844             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0576             nan     0.1000    0.0187
##      2        1.0271             nan     0.1000    0.0150
##      3        1.0033             nan     0.1000    0.0122
##      4        0.9831             nan     0.1000    0.0104
##      5        0.9635             nan     0.1000    0.0098
##      6        0.9463             nan     0.1000    0.0085
##      7        0.9332             nan     0.1000    0.0065
##      8        0.9179             nan     0.1000    0.0078
##      9        0.9046             nan     0.1000    0.0065
##     10        0.8916             nan     0.1000    0.0062
##     20        0.8140             nan     0.1000    0.0024
##     40        0.7362             nan     0.1000    0.0009
##     60        0.6947             nan     0.1000    0.0011
##     80        0.6694             nan     0.1000    0.0006
##    100        0.6543             nan     0.1000    0.0002
##    120        0.6435             nan     0.1000    0.0001
##    140        0.6359             nan     0.1000    0.0000
##    150        0.6327             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0261
##      2        1.0038             nan     0.1000    0.0197
##      3        0.9707             nan     0.1000    0.0169
##      4        0.9442             nan     0.1000    0.0133
##      5        0.9152             nan     0.1000    0.0145
##      6        0.8915             nan     0.1000    0.0117
##      7        0.8721             nan     0.1000    0.0097
##      8        0.8557             nan     0.1000    0.0079
##      9        0.8424             nan     0.1000    0.0068
##     10        0.8312             nan     0.1000    0.0056
##     20        0.7492             nan     0.1000    0.0027
##     40        0.6767             nan     0.1000    0.0012
##     60        0.6461             nan     0.1000    0.0005
##     80        0.6307             nan     0.1000    0.0004
##    100        0.6195             nan     0.1000    0.0001
##    120        0.6116             nan     0.1000    0.0001
##    140        0.6045             nan     0.1000    0.0001
##    150        0.6018             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0370             nan     0.1000    0.0300
##      2        0.9905             nan     0.1000    0.0232
##      3        0.9530             nan     0.1000    0.0187
##      4        0.9209             nan     0.1000    0.0161
##      5        0.8948             nan     0.1000    0.0130
##      6        0.8725             nan     0.1000    0.0110
##      7        0.8535             nan     0.1000    0.0094
##      8        0.8345             nan     0.1000    0.0095
##      9        0.8180             nan     0.1000    0.0082
##     10        0.8058             nan     0.1000    0.0059
##     20        0.7208             nan     0.1000    0.0028
##     40        0.6539             nan     0.1000    0.0008
##     60        0.6273             nan     0.1000    0.0005
##     80        0.6098             nan     0.1000    0.0001
##    100        0.5999             nan     0.1000    0.0001
##    120        0.5921             nan     0.1000    0.0001
##    140        0.5868             nan     0.1000    0.0000
##    150        0.5845             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0582             nan     0.1000    0.0186
##      2        1.0282             nan     0.1000    0.0149
##      3        1.0035             nan     0.1000    0.0121
##      4        0.9832             nan     0.1000    0.0103
##      5        0.9640             nan     0.1000    0.0097
##      6        0.9471             nan     0.1000    0.0085
##      7        0.9339             nan     0.1000    0.0066
##      8        0.9184             nan     0.1000    0.0077
##      9        0.9087             nan     0.1000    0.0049
##     10        0.8959             nan     0.1000    0.0062
##     20        0.8137             nan     0.1000    0.0025
##     40        0.7360             nan     0.1000    0.0017
##     60        0.6956             nan     0.1000    0.0008
##     80        0.6705             nan     0.1000    0.0005
##    100        0.6545             nan     0.1000    0.0004
##    120        0.6433             nan     0.1000    0.0001
##    140        0.6350             nan     0.1000    0.0001
##    150        0.6323             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0437             nan     0.1000    0.0256
##      2        1.0041             nan     0.1000    0.0198
##      3        0.9712             nan     0.1000    0.0169
##      4        0.9436             nan     0.1000    0.0136
##      5        0.9141             nan     0.1000    0.0146
##      6        0.8910             nan     0.1000    0.0117
##      7        0.8714             nan     0.1000    0.0099
##      8        0.8559             nan     0.1000    0.0077
##      9        0.8437             nan     0.1000    0.0060
##     10        0.8308             nan     0.1000    0.0063
##     20        0.7500             nan     0.1000    0.0031
##     40        0.6765             nan     0.1000    0.0013
##     60        0.6467             nan     0.1000    0.0004
##     80        0.6303             nan     0.1000    0.0004
##    100        0.6186             nan     0.1000    0.0003
##    120        0.6107             nan     0.1000    0.0002
##    140        0.6045             nan     0.1000   -0.0000
##    150        0.6016             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0356             nan     0.1000    0.0296
##      2        0.9888             nan     0.1000    0.0231
##      3        0.9516             nan     0.1000    0.0188
##      4        0.9199             nan     0.1000    0.0155
##      5        0.8939             nan     0.1000    0.0129
##      6        0.8687             nan     0.1000    0.0121
##      7        0.8500             nan     0.1000    0.0092
##      8        0.8318             nan     0.1000    0.0090
##      9        0.8166             nan     0.1000    0.0075
##     10        0.8030             nan     0.1000    0.0068
##     20        0.7187             nan     0.1000    0.0035
##     40        0.6539             nan     0.1000    0.0005
##     60        0.6268             nan     0.1000    0.0007
##     80        0.6108             nan     0.1000    0.0002
##    100        0.6012             nan     0.1000    0.0001
##    120        0.5928             nan     0.1000    0.0002
##    140        0.5869             nan     0.1000    0.0001
##    150        0.5836             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0582             nan     0.1000    0.0184
##      2        1.0287             nan     0.1000    0.0148
##      3        1.0042             nan     0.1000    0.0121
##      4        0.9836             nan     0.1000    0.0102
##      5        0.9642             nan     0.1000    0.0096
##      6        0.9471             nan     0.1000    0.0084
##      7        0.9340             nan     0.1000    0.0064
##      8        0.9185             nan     0.1000    0.0076
##      9        0.9054             nan     0.1000    0.0063
##     10        0.8954             nan     0.1000    0.0049
##     20        0.8156             nan     0.1000    0.0034
##     40        0.7378             nan     0.1000    0.0009
##     60        0.6960             nan     0.1000    0.0008
##     80        0.6710             nan     0.1000    0.0004
##    100        0.6565             nan     0.1000    0.0003
##    120        0.6461             nan     0.1000    0.0003
##    140        0.6388             nan     0.1000    0.0002
##    150        0.6358             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0440             nan     0.1000    0.0261
##      2        1.0044             nan     0.1000    0.0196
##      3        0.9714             nan     0.1000    0.0165
##      4        0.9450             nan     0.1000    0.0131
##      5        0.9159             nan     0.1000    0.0144
##      6        0.8925             nan     0.1000    0.0117
##      7        0.8752             nan     0.1000    0.0085
##      8        0.8591             nan     0.1000    0.0080
##      9        0.8432             nan     0.1000    0.0078
##     10        0.8321             nan     0.1000    0.0056
##     20        0.7511             nan     0.1000    0.0032
##     40        0.6790             nan     0.1000    0.0011
##     60        0.6486             nan     0.1000    0.0005
##     80        0.6335             nan     0.1000    0.0002
##    100        0.6224             nan     0.1000    0.0000
##    120        0.6150             nan     0.1000    0.0001
##    140        0.6088             nan     0.1000    0.0001
##    150        0.6061             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0365             nan     0.1000    0.0290
##      2        0.9908             nan     0.1000    0.0231
##      3        0.9537             nan     0.1000    0.0187
##      4        0.9215             nan     0.1000    0.0159
##      5        0.8951             nan     0.1000    0.0131
##      6        0.8697             nan     0.1000    0.0124
##      7        0.8516             nan     0.1000    0.0090
##      8        0.8338             nan     0.1000    0.0089
##      9        0.8174             nan     0.1000    0.0083
##     10        0.8036             nan     0.1000    0.0067
##     20        0.7193             nan     0.1000    0.0026
##     40        0.6550             nan     0.1000    0.0009
##     60        0.6292             nan     0.1000    0.0003
##     80        0.6123             nan     0.1000    0.0001
##    100        0.6024             nan     0.1000    0.0001
##    120        0.5951             nan     0.1000    0.0001
##    140        0.5884             nan     0.1000    0.0000
##    150        0.5864             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0586             nan     0.1000    0.0186
##      2        1.0286             nan     0.1000    0.0150
##      3        1.0046             nan     0.1000    0.0122
##      4        0.9845             nan     0.1000    0.0102
##      5        0.9647             nan     0.1000    0.0098
##      6        0.9509             nan     0.1000    0.0068
##      7        0.9342             nan     0.1000    0.0083
##      8        0.9192             nan     0.1000    0.0078
##      9        0.9057             nan     0.1000    0.0064
##     10        0.8929             nan     0.1000    0.0063
##     20        0.8154             nan     0.1000    0.0024
##     40        0.7374             nan     0.1000    0.0015
##     60        0.6954             nan     0.1000    0.0005
##     80        0.6712             nan     0.1000    0.0003
##    100        0.6563             nan     0.1000    0.0006
##    120        0.6441             nan     0.1000    0.0002
##    140        0.6367             nan     0.1000    0.0001
##    150        0.6333             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0432             nan     0.1000    0.0260
##      2        1.0034             nan     0.1000    0.0197
##      3        0.9702             nan     0.1000    0.0164
##      4        0.9428             nan     0.1000    0.0136
##      5        0.9144             nan     0.1000    0.0143
##      6        0.8909             nan     0.1000    0.0113
##      7        0.8718             nan     0.1000    0.0095
##      8        0.8553             nan     0.1000    0.0081
##      9        0.8420             nan     0.1000    0.0067
##     10        0.8323             nan     0.1000    0.0047
##     20        0.7509             nan     0.1000    0.0026
##     40        0.6769             nan     0.1000    0.0010
##     60        0.6481             nan     0.1000    0.0003
##     80        0.6322             nan     0.1000    0.0002
##    100        0.6208             nan     0.1000    0.0003
##    120        0.6123             nan     0.1000    0.0002
##    140        0.6067             nan     0.1000    0.0001
##    150        0.6036             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0360             nan     0.1000    0.0296
##      2        0.9902             nan     0.1000    0.0234
##      3        0.9532             nan     0.1000    0.0181
##      4        0.9213             nan     0.1000    0.0162
##      5        0.8956             nan     0.1000    0.0128
##      6        0.8702             nan     0.1000    0.0127
##      7        0.8511             nan     0.1000    0.0094
##      8        0.8328             nan     0.1000    0.0090
##      9        0.8169             nan     0.1000    0.0076
##     10        0.8026             nan     0.1000    0.0070
##     20        0.7207             nan     0.1000    0.0027
##     40        0.6540             nan     0.1000    0.0012
##     60        0.6270             nan     0.1000    0.0003
##     80        0.6116             nan     0.1000    0.0001
##    100        0.6013             nan     0.1000    0.0000
##    120        0.5926             nan     0.1000    0.0000
##    140        0.5867             nan     0.1000    0.0000
##    150        0.5840             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0579             nan     0.1000    0.0188
##      2        1.0280             nan     0.1000    0.0152
##      3        1.0031             nan     0.1000    0.0124
##      4        0.9822             nan     0.1000    0.0103
##      5        0.9625             nan     0.1000    0.0099
##      6        0.9445             nan     0.1000    0.0085
##      7        0.9311             nan     0.1000    0.0067
##      8        0.9157             nan     0.1000    0.0079
##      9        0.9056             nan     0.1000    0.0049
##     10        0.8932             nan     0.1000    0.0063
##     20        0.8097             nan     0.1000    0.0031
##     40        0.7327             nan     0.1000    0.0011
##     60        0.6908             nan     0.1000    0.0009
##     80        0.6661             nan     0.1000    0.0006
##    100        0.6509             nan     0.1000    0.0002
##    120        0.6397             nan     0.1000    0.0003
##    140        0.6323             nan     0.1000    0.0001
##    150        0.6291             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0430             nan     0.1000    0.0267
##      2        1.0026             nan     0.1000    0.0202
##      3        0.9692             nan     0.1000    0.0167
##      4        0.9406             nan     0.1000    0.0138
##      5        0.9118             nan     0.1000    0.0146
##      6        0.8881             nan     0.1000    0.0117
##      7        0.8705             nan     0.1000    0.0088
##      8        0.8555             nan     0.1000    0.0074
##      9        0.8418             nan     0.1000    0.0066
##     10        0.8294             nan     0.1000    0.0061
##     20        0.7440             nan     0.1000    0.0027
##     40        0.6737             nan     0.1000    0.0009
##     60        0.6434             nan     0.1000    0.0004
##     80        0.6276             nan     0.1000    0.0001
##    100        0.6170             nan     0.1000    0.0001
##    120        0.6086             nan     0.1000    0.0001
##    140        0.6029             nan     0.1000    0.0001
##    150        0.6007             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0352             nan     0.1000    0.0296
##      2        0.9871             nan     0.1000    0.0239
##      3        0.9494             nan     0.1000    0.0193
##      4        0.9180             nan     0.1000    0.0157
##      5        0.8925             nan     0.1000    0.0126
##      6        0.8691             nan     0.1000    0.0115
##      7        0.8500             nan     0.1000    0.0093
##      8        0.8308             nan     0.1000    0.0095
##      9        0.8147             nan     0.1000    0.0080
##     10        0.7993             nan     0.1000    0.0076
##     20        0.7161             nan     0.1000    0.0031
##     40        0.6499             nan     0.1000    0.0008
##     60        0.6215             nan     0.1000    0.0005
##     80        0.6070             nan     0.1000    0.0001
##    100        0.5963             nan     0.1000    0.0001
##    120        0.5899             nan     0.1000    0.0000
##    140        0.5845             nan     0.1000    0.0003
##    150        0.5810             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0585             nan     0.1000    0.0186
##      2        1.0283             nan     0.1000    0.0150
##      3        1.0034             nan     0.1000    0.0121
##      4        0.9829             nan     0.1000    0.0102
##      5        0.9638             nan     0.1000    0.0097
##      6        0.9499             nan     0.1000    0.0071
##      7        0.9329             nan     0.1000    0.0083
##      8        0.9174             nan     0.1000    0.0077
##      9        0.9044             nan     0.1000    0.0064
##     10        0.8950             nan     0.1000    0.0048
##     20        0.8149             nan     0.1000    0.0023
##     40        0.7363             nan     0.1000    0.0012
##     60        0.6957             nan     0.1000    0.0008
##     80        0.6709             nan     0.1000    0.0003
##    100        0.6556             nan     0.1000    0.0005
##    120        0.6460             nan     0.1000    0.0001
##    140        0.6373             nan     0.1000    0.0001
##    150        0.6344             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0437             nan     0.1000    0.0259
##      2        1.0041             nan     0.1000    0.0196
##      3        0.9713             nan     0.1000    0.0163
##      4        0.9453             nan     0.1000    0.0131
##      5        0.9162             nan     0.1000    0.0147
##      6        0.8925             nan     0.1000    0.0119
##      7        0.8731             nan     0.1000    0.0098
##      8        0.8588             nan     0.1000    0.0073
##      9        0.8471             nan     0.1000    0.0058
##     10        0.8339             nan     0.1000    0.0066
##     20        0.7482             nan     0.1000    0.0033
##     40        0.6791             nan     0.1000    0.0007
##     60        0.6498             nan     0.1000    0.0006
##     80        0.6340             nan     0.1000    0.0004
##    100        0.6219             nan     0.1000    0.0001
##    120        0.6137             nan     0.1000    0.0000
##    140        0.6068             nan     0.1000    0.0002
##    150        0.6043             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0371             nan     0.1000    0.0291
##      2        0.9892             nan     0.1000    0.0238
##      3        0.9517             nan     0.1000    0.0186
##      4        0.9208             nan     0.1000    0.0154
##      5        0.8912             nan     0.1000    0.0147
##      6        0.8697             nan     0.1000    0.0107
##      7        0.8489             nan     0.1000    0.0102
##      8        0.8306             nan     0.1000    0.0089
##      9        0.8170             nan     0.1000    0.0066
##     10        0.8056             nan     0.1000    0.0055
##     20        0.7221             nan     0.1000    0.0018
##     40        0.6537             nan     0.1000    0.0006
##     60        0.6255             nan     0.1000    0.0004
##     80        0.6093             nan     0.1000    0.0003
##    100        0.5993             nan     0.1000    0.0000
##    120        0.5928             nan     0.1000    0.0001
##    140        0.5870             nan     0.1000   -0.0001
##    150        0.5845             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0186
##      2        1.0284             nan     0.1000    0.0150
##      3        1.0034             nan     0.1000    0.0122
##      4        0.9831             nan     0.1000    0.0101
##      5        0.9685             nan     0.1000    0.0072
##      6        0.9494             nan     0.1000    0.0096
##      7        0.9327             nan     0.1000    0.0083
##      8        0.9172             nan     0.1000    0.0078
##      9        0.9041             nan     0.1000    0.0063
##     10        0.8911             nan     0.1000    0.0063
##     20        0.8149             nan     0.1000    0.0022
##     40        0.7350             nan     0.1000    0.0013
##     60        0.6951             nan     0.1000    0.0007
##     80        0.6696             nan     0.1000    0.0002
##    100        0.6547             nan     0.1000    0.0002
##    120        0.6433             nan     0.1000    0.0002
##    140        0.6357             nan     0.1000    0.0001
##    150        0.6325             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0432             nan     0.1000    0.0261
##      2        1.0034             nan     0.1000    0.0199
##      3        0.9708             nan     0.1000    0.0161
##      4        0.9355             nan     0.1000    0.0172
##      5        0.9081             nan     0.1000    0.0137
##      6        0.8892             nan     0.1000    0.0096
##      7        0.8691             nan     0.1000    0.0101
##      8        0.8535             nan     0.1000    0.0078
##      9        0.8403             nan     0.1000    0.0063
##     10        0.8286             nan     0.1000    0.0059
##     20        0.7466             nan     0.1000    0.0029
##     40        0.6757             nan     0.1000    0.0013
##     60        0.6467             nan     0.1000    0.0003
##     80        0.6310             nan     0.1000    0.0004
##    100        0.6190             nan     0.1000    0.0003
##    120        0.6105             nan     0.1000    0.0001
##    140        0.6041             nan     0.1000    0.0000
##    150        0.6020             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0352             nan     0.1000    0.0297
##      2        0.9910             nan     0.1000    0.0221
##      3        0.9520             nan     0.1000    0.0192
##      4        0.9209             nan     0.1000    0.0156
##      5        0.8949             nan     0.1000    0.0131
##      6        0.8734             nan     0.1000    0.0106
##      7        0.8518             nan     0.1000    0.0106
##      8        0.8331             nan     0.1000    0.0094
##      9        0.8178             nan     0.1000    0.0076
##     10        0.8051             nan     0.1000    0.0063
##     20        0.7192             nan     0.1000    0.0030
##     40        0.6520             nan     0.1000    0.0007
##     60        0.6260             nan     0.1000    0.0003
##     80        0.6105             nan     0.1000    0.0001
##    100        0.5997             nan     0.1000    0.0001
##    120        0.5928             nan     0.1000    0.0001
##    140        0.5877             nan     0.1000    0.0000
##    150        0.5846             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0187
##      2        1.0283             nan     0.1000    0.0151
##      3        1.0030             nan     0.1000    0.0122
##      4        0.9827             nan     0.1000    0.0099
##      5        0.9635             nan     0.1000    0.0098
##      6        0.9496             nan     0.1000    0.0067
##      7        0.9333             nan     0.1000    0.0082
##      8        0.9179             nan     0.1000    0.0078
##      9        0.9046             nan     0.1000    0.0063
##     10        0.8916             nan     0.1000    0.0064
##     20        0.8132             nan     0.1000    0.0033
##     40        0.7348             nan     0.1000    0.0018
##     60        0.6939             nan     0.1000    0.0008
##     80        0.6701             nan     0.1000    0.0005
##    100        0.6539             nan     0.1000    0.0003
##    120        0.6446             nan     0.1000    0.0003
##    140        0.6360             nan     0.1000    0.0001
##    150        0.6332             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0429             nan     0.1000    0.0264
##      2        1.0034             nan     0.1000    0.0200
##      3        0.9702             nan     0.1000    0.0170
##      4        0.9434             nan     0.1000    0.0133
##      5        0.9140             nan     0.1000    0.0146
##      6        0.8907             nan     0.1000    0.0118
##      7        0.8713             nan     0.1000    0.0096
##      8        0.8562             nan     0.1000    0.0075
##      9        0.8419             nan     0.1000    0.0070
##     10        0.8307             nan     0.1000    0.0056
##     20        0.7492             nan     0.1000    0.0024
##     40        0.6763             nan     0.1000    0.0009
##     60        0.6458             nan     0.1000    0.0007
##     80        0.6305             nan     0.1000    0.0002
##    100        0.6191             nan     0.1000    0.0001
##    120        0.6114             nan     0.1000    0.0001
##    140        0.6048             nan     0.1000    0.0000
##    150        0.6018             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0366             nan     0.1000    0.0293
##      2        0.9886             nan     0.1000    0.0237
##      3        0.9516             nan     0.1000    0.0187
##      4        0.9193             nan     0.1000    0.0159
##      5        0.8936             nan     0.1000    0.0130
##      6        0.8684             nan     0.1000    0.0126
##      7        0.8497             nan     0.1000    0.0095
##      8        0.8314             nan     0.1000    0.0091
##      9        0.8182             nan     0.1000    0.0064
##     10        0.8054             nan     0.1000    0.0061
##     20        0.7195             nan     0.1000    0.0027
##     40        0.6526             nan     0.1000    0.0009
##     60        0.6265             nan     0.1000    0.0006
##     80        0.6108             nan     0.1000    0.0003
##    100        0.5995             nan     0.1000    0.0003
##    120        0.5920             nan     0.1000    0.0001
##    140        0.5856             nan     0.1000    0.0000
##    150        0.5833             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0588             nan     0.1000    0.0187
##      2        1.0282             nan     0.1000    0.0151
##      3        1.0037             nan     0.1000    0.0123
##      4        0.9837             nan     0.1000    0.0101
##      5        0.9639             nan     0.1000    0.0098
##      6        0.9499             nan     0.1000    0.0069
##      7        0.9339             nan     0.1000    0.0079
##      8        0.9186             nan     0.1000    0.0078
##      9        0.9087             nan     0.1000    0.0048
##     10        0.8955             nan     0.1000    0.0064
##     20        0.8145             nan     0.1000    0.0033
##     40        0.7372             nan     0.1000    0.0015
##     60        0.6971             nan     0.1000    0.0008
##     80        0.6717             nan     0.1000    0.0006
##    100        0.6566             nan     0.1000    0.0002
##    120        0.6451             nan     0.1000    0.0001
##    140        0.6369             nan     0.1000    0.0001
##    150        0.6341             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0438             nan     0.1000    0.0256
##      2        1.0041             nan     0.1000    0.0194
##      3        0.9705             nan     0.1000    0.0167
##      4        0.9440             nan     0.1000    0.0132
##      5        0.9153             nan     0.1000    0.0145
##      6        0.8912             nan     0.1000    0.0116
##      7        0.8717             nan     0.1000    0.0096
##      8        0.8576             nan     0.1000    0.0071
##      9        0.8437             nan     0.1000    0.0069
##     10        0.8317             nan     0.1000    0.0060
##     20        0.7500             nan     0.1000    0.0025
##     40        0.6773             nan     0.1000    0.0010
##     60        0.6483             nan     0.1000    0.0004
##     80        0.6320             nan     0.1000    0.0002
##    100        0.6223             nan     0.1000    0.0001
##    120        0.6129             nan     0.1000    0.0001
##    140        0.6073             nan     0.1000    0.0001
##    150        0.6038             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0366             nan     0.1000    0.0298
##      2        0.9914             nan     0.1000    0.0227
##      3        0.9517             nan     0.1000    0.0200
##      4        0.9208             nan     0.1000    0.0152
##      5        0.8956             nan     0.1000    0.0126
##      6        0.8733             nan     0.1000    0.0112
##      7        0.8518             nan     0.1000    0.0109
##      8        0.8338             nan     0.1000    0.0091
##      9        0.8182             nan     0.1000    0.0077
##     10        0.8044             nan     0.1000    0.0066
##     20        0.7205             nan     0.1000    0.0030
##     40        0.6553             nan     0.1000    0.0009
##     60        0.6267             nan     0.1000    0.0005
##     80        0.6114             nan     0.1000    0.0006
##    100        0.6006             nan     0.1000    0.0003
##    120        0.5932             nan     0.1000    0.0000
##    140        0.5877             nan     0.1000    0.0000
##    150        0.5848             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0589             nan     0.1000    0.0184
##      2        1.0289             nan     0.1000    0.0149
##      3        1.0043             nan     0.1000    0.0121
##      4        0.9840             nan     0.1000    0.0108
##      5        0.9643             nan     0.1000    0.0096
##      6        0.9503             nan     0.1000    0.0071
##      7        0.9335             nan     0.1000    0.0084
##      8        0.9184             nan     0.1000    0.0076
##      9        0.9085             nan     0.1000    0.0049
##     10        0.8961             nan     0.1000    0.0064
##     20        0.8131             nan     0.1000    0.0032
##     40        0.7352             nan     0.1000    0.0011
##     60        0.6947             nan     0.1000    0.0005
##     80        0.6709             nan     0.1000    0.0007
##    100        0.6549             nan     0.1000    0.0004
##    120        0.6440             nan     0.1000    0.0001
##    140        0.6362             nan     0.1000    0.0003
##    150        0.6334             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0438             nan     0.1000    0.0259
##      2        1.0047             nan     0.1000    0.0196
##      3        0.9717             nan     0.1000    0.0167
##      4        0.9431             nan     0.1000    0.0136
##      5        0.9143             nan     0.1000    0.0142
##      6        0.8912             nan     0.1000    0.0117
##      7        0.8747             nan     0.1000    0.0081
##      8        0.8596             nan     0.1000    0.0073
##      9        0.8435             nan     0.1000    0.0079
##     10        0.8299             nan     0.1000    0.0069
##     20        0.7514             nan     0.1000    0.0020
##     40        0.6773             nan     0.1000    0.0008
##     60        0.6473             nan     0.1000    0.0004
##     80        0.6311             nan     0.1000    0.0004
##    100        0.6200             nan     0.1000    0.0001
##    120        0.6123             nan     0.1000    0.0001
##    140        0.6058             nan     0.1000    0.0002
##    150        0.6036             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0367             nan     0.1000    0.0294
##      2        0.9898             nan     0.1000    0.0235
##      3        0.9548             nan     0.1000    0.0176
##      4        0.9233             nan     0.1000    0.0157
##      5        0.8969             nan     0.1000    0.0130
##      6        0.8716             nan     0.1000    0.0128
##      7        0.8497             nan     0.1000    0.0108
##      8        0.8321             nan     0.1000    0.0091
##      9        0.8183             nan     0.1000    0.0066
##     10        0.8035             nan     0.1000    0.0072
##     20        0.7202             nan     0.1000    0.0025
##     40        0.6526             nan     0.1000    0.0010
##     60        0.6256             nan     0.1000    0.0004
##     80        0.6098             nan     0.1000    0.0002
##    100        0.6003             nan     0.1000   -0.0000
##    120        0.5918             nan     0.1000    0.0001
##    140        0.5863             nan     0.1000    0.0001
##    150        0.5838             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0577             nan     0.1000    0.0185
##      2        1.0273             nan     0.1000    0.0148
##      3        1.0032             nan     0.1000    0.0120
##      4        0.9825             nan     0.1000    0.0102
##      5        0.9642             nan     0.1000    0.0089
##      6        0.9451             nan     0.1000    0.0095
##      7        0.9317             nan     0.1000    0.0065
##      8        0.9165             nan     0.1000    0.0077
##      9        0.9033             nan     0.1000    0.0065
##     10        0.8935             nan     0.1000    0.0047
##     20        0.8113             nan     0.1000    0.0033
##     40        0.7342             nan     0.1000    0.0015
##     60        0.6921             nan     0.1000    0.0005
##     80        0.6674             nan     0.1000    0.0007
##    100        0.6519             nan     0.1000    0.0004
##    120        0.6409             nan     0.1000    0.0001
##    140        0.6341             nan     0.1000    0.0000
##    150        0.6305             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0431             nan     0.1000    0.0255
##      2        1.0036             nan     0.1000    0.0197
##      3        0.9705             nan     0.1000    0.0164
##      4        0.9430             nan     0.1000    0.0135
##      5        0.9134             nan     0.1000    0.0147
##      6        0.8945             nan     0.1000    0.0093
##      7        0.8736             nan     0.1000    0.0103
##      8        0.8568             nan     0.1000    0.0083
##      9        0.8431             nan     0.1000    0.0068
##     10        0.8296             nan     0.1000    0.0068
##     20        0.7481             nan     0.1000    0.0031
##     40        0.6746             nan     0.1000    0.0011
##     60        0.6438             nan     0.1000    0.0005
##     80        0.6291             nan     0.1000    0.0002
##    100        0.6172             nan     0.1000    0.0001
##    120        0.6090             nan     0.1000    0.0002
##    140        0.6017             nan     0.1000    0.0001
##    150        0.5993             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0361             nan     0.1000    0.0298
##      2        0.9893             nan     0.1000    0.0233
##      3        0.9519             nan     0.1000    0.0185
##      4        0.9192             nan     0.1000    0.0163
##      5        0.8928             nan     0.1000    0.0131
##      6        0.8705             nan     0.1000    0.0110
##      7        0.8487             nan     0.1000    0.0109
##      8        0.8336             nan     0.1000    0.0074
##      9        0.8190             nan     0.1000    0.0070
##     10        0.8032             nan     0.1000    0.0077
##     20        0.7153             nan     0.1000    0.0029
##     40        0.6495             nan     0.1000    0.0009
##     60        0.6226             nan     0.1000    0.0004
##     80        0.6080             nan     0.1000    0.0002
##    100        0.5976             nan     0.1000    0.0002
##    120        0.5888             nan     0.1000    0.0001
##    140        0.5837             nan     0.1000    0.0001
##    150        0.5811             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0580             nan     0.1000    0.0187
##      2        1.0276             nan     0.1000    0.0150
##      3        1.0034             nan     0.1000    0.0122
##      4        0.9832             nan     0.1000    0.0097
##      5        0.9634             nan     0.1000    0.0098
##      6        0.9463             nan     0.1000    0.0083
##      7        0.9331             nan     0.1000    0.0065
##      8        0.9175             nan     0.1000    0.0078
##      9        0.9041             nan     0.1000    0.0064
##     10        0.8918             nan     0.1000    0.0062
##     20        0.8138             nan     0.1000    0.0034
##     40        0.7352             nan     0.1000    0.0012
##     60        0.6944             nan     0.1000    0.0011
##     80        0.6704             nan     0.1000    0.0007
##    100        0.6559             nan     0.1000    0.0004
##    120        0.6445             nan     0.1000    0.0002
##    140        0.6364             nan     0.1000    0.0002
##    150        0.6337             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0437             nan     0.1000    0.0258
##      2        1.0026             nan     0.1000    0.0198
##      3        0.9692             nan     0.1000    0.0163
##      4        0.9418             nan     0.1000    0.0137
##      5        0.9128             nan     0.1000    0.0141
##      6        0.8896             nan     0.1000    0.0116
##      7        0.8699             nan     0.1000    0.0096
##      8        0.8556             nan     0.1000    0.0072
##      9        0.8404             nan     0.1000    0.0076
##     10        0.8310             nan     0.1000    0.0046
##     20        0.7502             nan     0.1000    0.0032
##     40        0.6768             nan     0.1000    0.0009
##     60        0.6461             nan     0.1000    0.0004
##     80        0.6303             nan     0.1000    0.0002
##    100        0.6198             nan     0.1000    0.0003
##    120        0.6112             nan     0.1000    0.0002
##    140        0.6053             nan     0.1000    0.0000
##    150        0.6025             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0356             nan     0.1000    0.0293
##      2        0.9890             nan     0.1000    0.0231
##      3        0.9507             nan     0.1000    0.0188
##      4        0.9195             nan     0.1000    0.0157
##      5        0.8943             nan     0.1000    0.0126
##      6        0.8718             nan     0.1000    0.0114
##      7        0.8525             nan     0.1000    0.0095
##      8        0.8340             nan     0.1000    0.0095
##      9        0.8184             nan     0.1000    0.0078
##     10        0.8068             nan     0.1000    0.0058
##     20        0.7227             nan     0.1000    0.0034
##     40        0.6531             nan     0.1000    0.0010
##     60        0.6266             nan     0.1000    0.0005
##     80        0.6107             nan     0.1000    0.0004
##    100        0.6016             nan     0.1000    0.0000
##    120        0.5928             nan     0.1000    0.0001
##    140        0.5878             nan     0.1000    0.0002
##    150        0.5858             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0581             nan     0.1000    0.0186
##      2        1.0290             nan     0.1000    0.0150
##      3        1.0044             nan     0.1000    0.0122
##      4        0.9839             nan     0.1000    0.0103
##      5        0.9644             nan     0.1000    0.0098
##      6        0.9501             nan     0.1000    0.0069
##      7        0.9333             nan     0.1000    0.0082
##      8        0.9180             nan     0.1000    0.0078
##      9        0.9078             nan     0.1000    0.0050
##     10        0.8954             nan     0.1000    0.0062
##     20        0.8121             nan     0.1000    0.0033
##     40        0.7347             nan     0.1000    0.0015
##     60        0.6944             nan     0.1000    0.0007
##     80        0.6690             nan     0.1000    0.0006
##    100        0.6535             nan     0.1000    0.0002
##    120        0.6427             nan     0.1000    0.0002
##    140        0.6344             nan     0.1000    0.0000
##    150        0.6313             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0447             nan     0.1000    0.0259
##      2        1.0046             nan     0.1000    0.0202
##      3        0.9711             nan     0.1000    0.0170
##      4        0.9442             nan     0.1000    0.0135
##      5        0.9151             nan     0.1000    0.0144
##      6        0.8918             nan     0.1000    0.0117
##      7        0.8743             nan     0.1000    0.0087
##      8        0.8583             nan     0.1000    0.0081
##      9        0.8421             nan     0.1000    0.0081
##     10        0.8298             nan     0.1000    0.0062
##     20        0.7475             nan     0.1000    0.0031
##     40        0.6756             nan     0.1000    0.0008
##     60        0.6462             nan     0.1000    0.0003
##     80        0.6286             nan     0.1000    0.0003
##    100        0.6184             nan     0.1000    0.0001
##    120        0.6094             nan     0.1000    0.0000
##    140        0.6031             nan     0.1000    0.0000
##    150        0.6003             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0355             nan     0.1000    0.0295
##      2        0.9890             nan     0.1000    0.0234
##      3        0.9514             nan     0.1000    0.0185
##      4        0.9186             nan     0.1000    0.0159
##      5        0.8930             nan     0.1000    0.0127
##      6        0.8702             nan     0.1000    0.0111
##      7        0.8479             nan     0.1000    0.0113
##      8        0.8301             nan     0.1000    0.0090
##      9        0.8170             nan     0.1000    0.0063
##     10        0.8044             nan     0.1000    0.0062
##     20        0.7174             nan     0.1000    0.0029
##     40        0.6523             nan     0.1000    0.0006
##     60        0.6258             nan     0.1000    0.0002
##     80        0.6097             nan     0.1000    0.0002
##    100        0.5988             nan     0.1000   -0.0000
##    120        0.5908             nan     0.1000    0.0002
##    140        0.5857             nan     0.1000    0.0000
##    150        0.5831             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0578             nan     0.1000    0.0189
##      2        1.0275             nan     0.1000    0.0152
##      3        1.0026             nan     0.1000    0.0124
##      4        0.9818             nan     0.1000    0.0101
##      5        0.9619             nan     0.1000    0.0099
##      6        0.9450             nan     0.1000    0.0084
##      7        0.9286             nan     0.1000    0.0080
##      8        0.9160             nan     0.1000    0.0066
##      9        0.9035             nan     0.1000    0.0063
##     10        0.8938             nan     0.1000    0.0048
##     20        0.8112             nan     0.1000    0.0033
##     40        0.7333             nan     0.1000    0.0012
##     60        0.6926             nan     0.1000    0.0005
##     80        0.6669             nan     0.1000    0.0007
##    100        0.6509             nan     0.1000    0.0002
##    120        0.6411             nan     0.1000    0.0001
##    140        0.6329             nan     0.1000    0.0003
##    150        0.6291             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0435             nan     0.1000    0.0261
##      2        1.0030             nan     0.1000    0.0200
##      3        0.9711             nan     0.1000    0.0158
##      4        0.9424             nan     0.1000    0.0144
##      5        0.9126             nan     0.1000    0.0147
##      6        0.8883             nan     0.1000    0.0118
##      7        0.8684             nan     0.1000    0.0097
##      8        0.8541             nan     0.1000    0.0072
##      9        0.8400             nan     0.1000    0.0069
##     10        0.8283             nan     0.1000    0.0057
##     20        0.7489             nan     0.1000    0.0027
##     40        0.6751             nan     0.1000    0.0012
##     60        0.6447             nan     0.1000    0.0004
##     80        0.6287             nan     0.1000    0.0002
##    100        0.6185             nan     0.1000    0.0001
##    120        0.6103             nan     0.1000    0.0001
##    140        0.6036             nan     0.1000    0.0001
##    150        0.6010             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0357             nan     0.1000    0.0296
##      2        0.9892             nan     0.1000    0.0237
##      3        0.9507             nan     0.1000    0.0189
##      4        0.9196             nan     0.1000    0.0156
##      5        0.8927             nan     0.1000    0.0135
##      6        0.8704             nan     0.1000    0.0110
##      7        0.8481             nan     0.1000    0.0110
##      8        0.8297             nan     0.1000    0.0090
##      9        0.8168             nan     0.1000    0.0064
##     10        0.8025             nan     0.1000    0.0068
##     20        0.7194             nan     0.1000    0.0031
##     40        0.6509             nan     0.1000    0.0012
##     60        0.6246             nan     0.1000    0.0001
##     80        0.6093             nan     0.1000    0.0002
##    100        0.5991             nan     0.1000    0.0001
##    120        0.5911             nan     0.1000    0.0000
##    140        0.5859             nan     0.1000    0.0000
##    150        0.5834             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0589             nan     0.1000    0.0186
##      2        1.0292             nan     0.1000    0.0150
##      3        1.0050             nan     0.1000    0.0122
##      4        0.9847             nan     0.1000    0.0100
##      5        0.9655             nan     0.1000    0.0098
##      6        0.9481             nan     0.1000    0.0084
##      7        0.9350             nan     0.1000    0.0067
##      8        0.9201             nan     0.1000    0.0078
##      9        0.9099             nan     0.1000    0.0050
##     10        0.8975             nan     0.1000    0.0061
##     20        0.8166             nan     0.1000    0.0023
##     40        0.7378             nan     0.1000    0.0011
##     60        0.6980             nan     0.1000    0.0013
##     80        0.6730             nan     0.1000    0.0005
##    100        0.6567             nan     0.1000    0.0002
##    120        0.6464             nan     0.1000    0.0002
##    140        0.6383             nan     0.1000    0.0001
##    150        0.6353             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0256
##      2        1.0040             nan     0.1000    0.0196
##      3        0.9709             nan     0.1000    0.0166
##      4        0.9430             nan     0.1000    0.0136
##      5        0.9144             nan     0.1000    0.0144
##      6        0.8915             nan     0.1000    0.0115
##      7        0.8722             nan     0.1000    0.0096
##      8        0.8576             nan     0.1000    0.0073
##      9        0.8457             nan     0.1000    0.0058
##     10        0.8338             nan     0.1000    0.0061
##     20        0.7518             nan     0.1000    0.0033
##     40        0.6794             nan     0.1000    0.0011
##     60        0.6498             nan     0.1000    0.0005
##     80        0.6326             nan     0.1000    0.0003
##    100        0.6213             nan     0.1000    0.0001
##    120        0.6132             nan     0.1000    0.0001
##    140        0.6062             nan     0.1000    0.0001
##    150        0.6033             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0360             nan     0.1000    0.0292
##      2        0.9893             nan     0.1000    0.0232
##      3        0.9515             nan     0.1000    0.0186
##      4        0.9204             nan     0.1000    0.0154
##      5        0.8916             nan     0.1000    0.0140
##      6        0.8697             nan     0.1000    0.0108
##      7        0.8485             nan     0.1000    0.0107
##      8        0.8311             nan     0.1000    0.0087
##      9        0.8185             nan     0.1000    0.0061
##     10        0.8057             nan     0.1000    0.0063
##     20        0.7200             nan     0.1000    0.0028
##     40        0.6555             nan     0.1000    0.0010
##     60        0.6296             nan     0.1000    0.0006
##     80        0.6138             nan     0.1000    0.0002
##    100        0.6036             nan     0.1000    0.0003
##    120        0.5960             nan     0.1000    0.0000
##    140        0.5897             nan     0.1000    0.0000
##    150        0.5877             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0582             nan     0.1000    0.0186
##      2        1.0283             nan     0.1000    0.0150
##      3        1.0041             nan     0.1000    0.0122
##      4        0.9837             nan     0.1000    0.0105
##      5        0.9645             nan     0.1000    0.0098
##      6        0.9468             nan     0.1000    0.0086
##      7        0.9332             nan     0.1000    0.0064
##      8        0.9171             nan     0.1000    0.0078
##      9        0.9071             nan     0.1000    0.0049
##     10        0.8946             nan     0.1000    0.0062
##     20        0.8153             nan     0.1000    0.0026
##     40        0.7344             nan     0.1000    0.0015
##     60        0.6945             nan     0.1000    0.0005
##     80        0.6695             nan     0.1000    0.0005
##    100        0.6545             nan     0.1000    0.0003
##    120        0.6432             nan     0.1000    0.0002
##    140        0.6353             nan     0.1000    0.0001
##    150        0.6325             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0440             nan     0.1000    0.0261
##      2        1.0041             nan     0.1000    0.0200
##      3        0.9711             nan     0.1000    0.0165
##      4        0.9436             nan     0.1000    0.0137
##      5        0.9144             nan     0.1000    0.0147
##      6        0.8912             nan     0.1000    0.0120
##      7        0.8718             nan     0.1000    0.0098
##      8        0.8568             nan     0.1000    0.0074
##      9        0.8448             nan     0.1000    0.0059
##     10        0.8328             nan     0.1000    0.0059
##     20        0.7495             nan     0.1000    0.0029
##     40        0.6758             nan     0.1000    0.0007
##     60        0.6462             nan     0.1000    0.0006
##     80        0.6302             nan     0.1000    0.0004
##    100        0.6196             nan     0.1000    0.0001
##    120        0.6108             nan     0.1000    0.0001
##    140        0.6049             nan     0.1000    0.0000
##    150        0.6024             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0358             nan     0.1000    0.0294
##      2        0.9890             nan     0.1000    0.0233
##      3        0.9520             nan     0.1000    0.0184
##      4        0.9196             nan     0.1000    0.0165
##      5        0.8936             nan     0.1000    0.0127
##      6        0.8682             nan     0.1000    0.0127
##      7        0.8475             nan     0.1000    0.0100
##      8        0.8312             nan     0.1000    0.0081
##      9        0.8156             nan     0.1000    0.0078
##     10        0.8012             nan     0.1000    0.0072
##     20        0.7166             nan     0.1000    0.0031
##     40        0.6518             nan     0.1000    0.0010
##     60        0.6261             nan     0.1000    0.0004
##     80        0.6106             nan     0.1000    0.0003
##    100        0.6004             nan     0.1000   -0.0000
##    120        0.5923             nan     0.1000    0.0001
##    140        0.5868             nan     0.1000    0.0001
##    150        0.5843             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0589             nan     0.1000    0.0185
##      2        1.0286             nan     0.1000    0.0149
##      3        1.0049             nan     0.1000    0.0121
##      4        0.9847             nan     0.1000    0.0101
##      5        0.9653             nan     0.1000    0.0097
##      6        0.9479             nan     0.1000    0.0086
##      7        0.9346             nan     0.1000    0.0067
##      8        0.9193             nan     0.1000    0.0078
##      9        0.9093             nan     0.1000    0.0049
##     10        0.8966             nan     0.1000    0.0062
##     20        0.8168             nan     0.1000    0.0035
##     40        0.7368             nan     0.1000    0.0011
##     60        0.6972             nan     0.1000    0.0005
##     80        0.6714             nan     0.1000    0.0004
##    100        0.6563             nan     0.1000    0.0005
##    120        0.6463             nan     0.1000    0.0001
##    140        0.6381             nan     0.1000    0.0001
##    150        0.6349             nan     0.1000    0.0002
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0435             nan     0.1000    0.0261
##      2        1.0040             nan     0.1000    0.0196
##      3        0.9723             nan     0.1000    0.0155
##      4        0.9441             nan     0.1000    0.0141
##      5        0.9151             nan     0.1000    0.0146
##      6        0.8919             nan     0.1000    0.0116
##      7        0.8727             nan     0.1000    0.0096
##      8        0.8574             nan     0.1000    0.0077
##      9        0.8428             nan     0.1000    0.0073
##     10        0.8320             nan     0.1000    0.0053
##     20        0.7515             nan     0.1000    0.0028
##     40        0.6798             nan     0.1000    0.0006
##     60        0.6494             nan     0.1000    0.0003
##     80        0.6329             nan     0.1000    0.0001
##    100        0.6214             nan     0.1000    0.0001
##    120        0.6133             nan     0.1000    0.0003
##    140        0.6070             nan     0.1000    0.0001
##    150        0.6044             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0368             nan     0.1000    0.0299
##      2        0.9906             nan     0.1000    0.0228
##      3        0.9529             nan     0.1000    0.0187
##      4        0.9219             nan     0.1000    0.0157
##      5        0.8963             nan     0.1000    0.0130
##      6        0.8736             nan     0.1000    0.0111
##      7        0.8525             nan     0.1000    0.0105
##      8        0.8362             nan     0.1000    0.0079
##      9        0.8200             nan     0.1000    0.0080
##     10        0.8084             nan     0.1000    0.0059
##     20        0.7230             nan     0.1000    0.0021
##     40        0.6562             nan     0.1000    0.0009
##     60        0.6286             nan     0.1000    0.0002
##     80        0.6135             nan     0.1000    0.0002
##    100        0.6020             nan     0.1000    0.0002
##    120        0.5950             nan     0.1000    0.0001
##    140        0.5896             nan     0.1000    0.0001
##    150        0.5879             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0577             nan     0.1000    0.0186
##      2        1.0281             nan     0.1000    0.0150
##      3        1.0038             nan     0.1000    0.0122
##      4        0.9832             nan     0.1000    0.0105
##      5        0.9638             nan     0.1000    0.0097
##      6        0.9498             nan     0.1000    0.0072
##      7        0.9331             nan     0.1000    0.0083
##      8        0.9181             nan     0.1000    0.0077
##      9        0.9082             nan     0.1000    0.0049
##     10        0.8952             nan     0.1000    0.0062
##     20        0.8124             nan     0.1000    0.0034
##     40        0.7361             nan     0.1000    0.0010
##     60        0.6940             nan     0.1000    0.0005
##     80        0.6701             nan     0.1000    0.0003
##    100        0.6530             nan     0.1000    0.0004
##    120        0.6437             nan     0.1000    0.0000
##    140        0.6354             nan     0.1000    0.0000
##    150        0.6323             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0436             nan     0.1000    0.0258
##      2        1.0049             nan     0.1000    0.0197
##      3        0.9733             nan     0.1000    0.0155
##      4        0.9445             nan     0.1000    0.0144
##      5        0.9153             nan     0.1000    0.0148
##      6        0.8913             nan     0.1000    0.0120
##      7        0.8717             nan     0.1000    0.0098
##      8        0.8572             nan     0.1000    0.0072
##      9        0.8430             nan     0.1000    0.0070
##     10        0.8313             nan     0.1000    0.0060
##     20        0.7475             nan     0.1000    0.0037
##     40        0.6765             nan     0.1000    0.0012
##     60        0.6460             nan     0.1000    0.0002
##     80        0.6293             nan     0.1000    0.0001
##    100        0.6177             nan     0.1000    0.0002
##    120        0.6090             nan     0.1000    0.0001
##    140        0.6031             nan     0.1000    0.0000
##    150        0.6004             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0359             nan     0.1000    0.0297
##      2        0.9896             nan     0.1000    0.0233
##      3        0.9514             nan     0.1000    0.0191
##      4        0.9202             nan     0.1000    0.0158
##      5        0.8937             nan     0.1000    0.0128
##      6        0.8692             nan     0.1000    0.0123
##      7        0.8483             nan     0.1000    0.0104
##      8        0.8330             nan     0.1000    0.0076
##      9        0.8168             nan     0.1000    0.0082
##     10        0.8043             nan     0.1000    0.0063
##     20        0.7204             nan     0.1000    0.0030
##     40        0.6545             nan     0.1000    0.0009
##     60        0.6269             nan     0.1000    0.0004
##     80        0.6115             nan     0.1000    0.0000
##    100        0.6003             nan     0.1000    0.0002
##    120        0.5920             nan     0.1000    0.0000
##    140        0.5855             nan     0.1000    0.0000
##    150        0.5834             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0579             nan     0.1000    0.0188
##      2        1.0283             nan     0.1000    0.0151
##      3        1.0031             nan     0.1000    0.0124
##      4        0.9827             nan     0.1000    0.0099
##      5        0.9629             nan     0.1000    0.0099
##      6        0.9457             nan     0.1000    0.0087
##      7        0.9323             nan     0.1000    0.0066
##      8        0.9161             nan     0.1000    0.0078
##      9        0.9029             nan     0.1000    0.0063
##     10        0.8932             nan     0.1000    0.0049
##     20        0.8127             nan     0.1000    0.0034
##     40        0.7320             nan     0.1000    0.0015
##     60        0.6930             nan     0.1000    0.0012
##     80        0.6677             nan     0.1000    0.0005
##    100        0.6532             nan     0.1000    0.0002
##    120        0.6410             nan     0.1000    0.0002
##    140        0.6336             nan     0.1000    0.0001
##    150        0.6308             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0434             nan     0.1000    0.0264
##      2        1.0037             nan     0.1000    0.0202
##      3        0.9704             nan     0.1000    0.0168
##      4        0.9437             nan     0.1000    0.0133
##      5        0.9137             nan     0.1000    0.0149
##      6        0.8900             nan     0.1000    0.0118
##      7        0.8704             nan     0.1000    0.0099
##      8        0.8550             nan     0.1000    0.0076
##      9        0.8404             nan     0.1000    0.0072
##     10        0.8292             nan     0.1000    0.0057
##     20        0.7462             nan     0.1000    0.0034
##     40        0.6745             nan     0.1000    0.0009
##     60        0.6445             nan     0.1000    0.0006
##     80        0.6290             nan     0.1000    0.0002
##    100        0.6176             nan     0.1000    0.0001
##    120        0.6101             nan     0.1000    0.0001
##    140        0.6047             nan     0.1000    0.0001
##    150        0.6019             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0356             nan     0.1000    0.0300
##      2        0.9887             nan     0.1000    0.0236
##      3        0.9516             nan     0.1000    0.0185
##      4        0.9193             nan     0.1000    0.0162
##      5        0.8919             nan     0.1000    0.0132
##      6        0.8699             nan     0.1000    0.0112
##      7        0.8477             nan     0.1000    0.0109
##      8        0.8291             nan     0.1000    0.0090
##      9        0.8131             nan     0.1000    0.0078
##     10        0.8015             nan     0.1000    0.0055
##     20        0.7186             nan     0.1000    0.0032
##     40        0.6518             nan     0.1000    0.0009
##     60        0.6236             nan     0.1000    0.0003
##     80        0.6103             nan     0.1000    0.0002
##    100        0.5994             nan     0.1000    0.0001
##    120        0.5924             nan     0.1000   -0.0000
##    140        0.5860             nan     0.1000    0.0001
##    150        0.5836             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0578             nan     0.1000    0.0187
##      2        1.0276             nan     0.1000    0.0150
##      3        1.0029             nan     0.1000    0.0122
##      4        0.9828             nan     0.1000    0.0100
##      5        0.9631             nan     0.1000    0.0098
##      6        0.9492             nan     0.1000    0.0067
##      7        0.9331             nan     0.1000    0.0081
##      8        0.9176             nan     0.1000    0.0078
##      9        0.9050             nan     0.1000    0.0063
##     10        0.8951             nan     0.1000    0.0048
##     20        0.8148             nan     0.1000    0.0022
##     40        0.7354             nan     0.1000    0.0015
##     60        0.6953             nan     0.1000    0.0005
##     80        0.6709             nan     0.1000    0.0003
##    100        0.6559             nan     0.1000    0.0002
##    120        0.6441             nan     0.1000    0.0003
##    140        0.6361             nan     0.1000    0.0001
##    150        0.6332             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0439             nan     0.1000    0.0258
##      2        1.0049             nan     0.1000    0.0199
##      3        0.9711             nan     0.1000    0.0165
##      4        0.9446             nan     0.1000    0.0131
##      5        0.9152             nan     0.1000    0.0148
##      6        0.8917             nan     0.1000    0.0121
##      7        0.8721             nan     0.1000    0.0097
##      8        0.8574             nan     0.1000    0.0070
##      9        0.8433             nan     0.1000    0.0070
##     10        0.8319             nan     0.1000    0.0056
##     20        0.7507             nan     0.1000    0.0031
##     40        0.6784             nan     0.1000    0.0012
##     60        0.6484             nan     0.1000    0.0004
##     80        0.6309             nan     0.1000    0.0003
##    100        0.6197             nan     0.1000    0.0001
##    120        0.6108             nan     0.1000    0.0001
##    140        0.6050             nan     0.1000    0.0002
##    150        0.6028             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0363             nan     0.1000    0.0291
##      2        0.9890             nan     0.1000    0.0236
##      3        0.9518             nan     0.1000    0.0188
##      4        0.9205             nan     0.1000    0.0160
##      5        0.8954             nan     0.1000    0.0123
##      6        0.8732             nan     0.1000    0.0113
##      7        0.8542             nan     0.1000    0.0093
##      8        0.8382             nan     0.1000    0.0079
##      9        0.8214             nan     0.1000    0.0082
##     10        0.8061             nan     0.1000    0.0078
##     20        0.7234             nan     0.1000    0.0023
##     40        0.6544             nan     0.1000    0.0008
##     60        0.6275             nan     0.1000    0.0005
##     80        0.6109             nan     0.1000    0.0001
##    100        0.6005             nan     0.1000    0.0000
##    120        0.5926             nan     0.1000    0.0001
##    140        0.5870             nan     0.1000    0.0000
##    150        0.5850             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0590             nan     0.1000    0.0184
##      2        1.0286             nan     0.1000    0.0148
##      3        1.0044             nan     0.1000    0.0120
##      4        0.9842             nan     0.1000    0.0099
##      5        0.9653             nan     0.1000    0.0096
##      6        0.9481             nan     0.1000    0.0085
##      7        0.9321             nan     0.1000    0.0078
##      8        0.9192             nan     0.1000    0.0064
##      9        0.9057             nan     0.1000    0.0064
##     10        0.8962             nan     0.1000    0.0045
##     20        0.8155             nan     0.1000    0.0028
##     40        0.7380             nan     0.1000    0.0009
##     60        0.6962             nan     0.1000    0.0007
##     80        0.6712             nan     0.1000    0.0005
##    100        0.6558             nan     0.1000    0.0004
##    120        0.6448             nan     0.1000    0.0001
##    140        0.6378             nan     0.1000    0.0001
##    150        0.6350             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0434             nan     0.1000    0.0257
##      2        1.0038             nan     0.1000    0.0197
##      3        0.9706             nan     0.1000    0.0163
##      4        0.9440             nan     0.1000    0.0131
##      5        0.9148             nan     0.1000    0.0145
##      6        0.8919             nan     0.1000    0.0115
##      7        0.8761             nan     0.1000    0.0077
##      8        0.8603             nan     0.1000    0.0079
##      9        0.8440             nan     0.1000    0.0081
##     10        0.8321             nan     0.1000    0.0058
##     20        0.7496             nan     0.1000    0.0033
##     40        0.6781             nan     0.1000    0.0013
##     60        0.6477             nan     0.1000    0.0003
##     80        0.6314             nan     0.1000    0.0002
##    100        0.6196             nan     0.1000    0.0003
##    120        0.6113             nan     0.1000    0.0002
##    140        0.6053             nan     0.1000    0.0001
##    150        0.6032             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0359             nan     0.1000    0.0300
##      2        0.9894             nan     0.1000    0.0233
##      3        0.9526             nan     0.1000    0.0186
##      4        0.9219             nan     0.1000    0.0152
##      5        0.8947             nan     0.1000    0.0134
##      6        0.8693             nan     0.1000    0.0126
##      7        0.8506             nan     0.1000    0.0095
##      8        0.8321             nan     0.1000    0.0091
##      9        0.8166             nan     0.1000    0.0076
##     10        0.8025             nan     0.1000    0.0069
##     20        0.7209             nan     0.1000    0.0029
##     40        0.6554             nan     0.1000    0.0011
##     60        0.6288             nan     0.1000    0.0004
##     80        0.6132             nan     0.1000    0.0000
##    100        0.6023             nan     0.1000    0.0001
##    120        0.5950             nan     0.1000    0.0001
##    140        0.5891             nan     0.1000    0.0002
##    150        0.5868             nan     0.1000   -0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0576             nan     0.1000    0.0187
##      2        1.0279             nan     0.1000    0.0150
##      3        1.0032             nan     0.1000    0.0122
##      4        0.9830             nan     0.1000    0.0099
##      5        0.9636             nan     0.1000    0.0098
##      6        0.9496             nan     0.1000    0.0070
##      7        0.9326             nan     0.1000    0.0083
##      8        0.9171             nan     0.1000    0.0078
##      9        0.9043             nan     0.1000    0.0064
##     10        0.8947             nan     0.1000    0.0048
##     20        0.8137             nan     0.1000    0.0031
##     40        0.7345             nan     0.1000    0.0013
##     60        0.6933             nan     0.1000    0.0005
##     80        0.6692             nan     0.1000    0.0006
##    100        0.6525             nan     0.1000    0.0001
##    120        0.6413             nan     0.1000    0.0004
##    140        0.6345             nan     0.1000    0.0003
##    150        0.6303             nan     0.1000    0.0003
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0427             nan     0.1000    0.0261
##      2        1.0033             nan     0.1000    0.0197
##      3        0.9705             nan     0.1000    0.0164
##      4        0.9435             nan     0.1000    0.0138
##      5        0.9144             nan     0.1000    0.0144
##      6        0.8914             nan     0.1000    0.0119
##      7        0.8719             nan     0.1000    0.0096
##      8        0.8569             nan     0.1000    0.0073
##      9        0.8450             nan     0.1000    0.0059
##     10        0.8323             nan     0.1000    0.0065
##     20        0.7495             nan     0.1000    0.0031
##     40        0.6751             nan     0.1000    0.0008
##     60        0.6454             nan     0.1000    0.0005
##     80        0.6285             nan     0.1000    0.0003
##    100        0.6174             nan     0.1000    0.0001
##    120        0.6084             nan     0.1000    0.0001
##    140        0.6018             nan     0.1000    0.0001
##    150        0.5993             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0360             nan     0.1000    0.0294
##      2        0.9888             nan     0.1000    0.0230
##      3        0.9519             nan     0.1000    0.0185
##      4        0.9189             nan     0.1000    0.0165
##      5        0.8933             nan     0.1000    0.0127
##      6        0.8682             nan     0.1000    0.0125
##      7        0.8490             nan     0.1000    0.0095
##      8        0.8307             nan     0.1000    0.0090
##      9        0.8141             nan     0.1000    0.0084
##     10        0.8000             nan     0.1000    0.0069
##     20        0.7168             nan     0.1000    0.0026
##     40        0.6492             nan     0.1000    0.0008
##     60        0.6242             nan     0.1000    0.0004
##     80        0.6081             nan     0.1000    0.0002
##    100        0.5975             nan     0.1000    0.0002
##    120        0.5901             nan     0.1000   -0.0000
##    140        0.5841             nan     0.1000    0.0000
##    150        0.5819             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0578             nan     0.1000    0.0186
##      2        1.0282             nan     0.1000    0.0150
##      3        1.0042             nan     0.1000    0.0122
##      4        0.9839             nan     0.1000    0.0097
##      5        0.9644             nan     0.1000    0.0098
##      6        0.9472             nan     0.1000    0.0085
##      7        0.9340             nan     0.1000    0.0062
##      8        0.9178             nan     0.1000    0.0078
##      9        0.9043             nan     0.1000    0.0063
##     10        0.8947             nan     0.1000    0.0046
##     20        0.8143             nan     0.1000    0.0031
##     40        0.7347             nan     0.1000    0.0015
##     60        0.6947             nan     0.1000    0.0009
##     80        0.6698             nan     0.1000    0.0004
##    100        0.6542             nan     0.1000    0.0002
##    120        0.6424             nan     0.1000    0.0001
##    140        0.6347             nan     0.1000    0.0001
##    150        0.6316             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0443             nan     0.1000    0.0262
##      2        1.0046             nan     0.1000    0.0199
##      3        0.9716             nan     0.1000    0.0167
##      4        0.9439             nan     0.1000    0.0139
##      5        0.9151             nan     0.1000    0.0146
##      6        0.8917             nan     0.1000    0.0118
##      7        0.8721             nan     0.1000    0.0096
##      8        0.8574             nan     0.1000    0.0075
##      9        0.8452             nan     0.1000    0.0061
##     10        0.8324             nan     0.1000    0.0064
##     20        0.7493             nan     0.1000    0.0024
##     40        0.6751             nan     0.1000    0.0012
##     60        0.6458             nan     0.1000    0.0003
##     80        0.6286             nan     0.1000    0.0005
##    100        0.6166             nan     0.1000    0.0004
##    120        0.6092             nan     0.1000    0.0001
##    140        0.6020             nan     0.1000    0.0000
##    150        0.5996             nan     0.1000    0.0000
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0365             nan     0.1000    0.0300
##      2        0.9892             nan     0.1000    0.0232
##      3        0.9524             nan     0.1000    0.0183
##      4        0.9203             nan     0.1000    0.0161
##      5        0.8943             nan     0.1000    0.0131
##      6        0.8719             nan     0.1000    0.0110
##      7        0.8527             nan     0.1000    0.0094
##      8        0.8336             nan     0.1000    0.0094
##      9        0.8171             nan     0.1000    0.0080
##     10        0.8034             nan     0.1000    0.0068
##     20        0.7198             nan     0.1000    0.0023
##     40        0.6533             nan     0.1000    0.0008
##     60        0.6264             nan     0.1000    0.0002
##     80        0.6089             nan     0.1000    0.0003
##    100        0.5989             nan     0.1000    0.0001
##    120        0.5904             nan     0.1000    0.0001
##    140        0.5842             nan     0.1000   -0.0000
##    150        0.5822             nan     0.1000    0.0001
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0364             nan     0.1000    0.0292
##      2        0.9899             nan     0.1000    0.0230
##      3        0.9530             nan     0.1000    0.0189
##      4        0.9207             nan     0.1000    0.0164
##      5        0.8942             nan     0.1000    0.0129
##      6        0.8729             nan     0.1000    0.0109
##      7        0.8514             nan     0.1000    0.0108
##      8        0.8331             nan     0.1000    0.0092
##      9        0.8200             nan     0.1000    0.0063
##     10        0.8074             nan     0.1000    0.0063
##     20        0.7214             nan     0.1000    0.0030
##     40        0.6544             nan     0.1000    0.0009
##     60        0.6264             nan     0.1000    0.0003
##     80        0.6110             nan     0.1000    0.0003
##    100        0.5996             nan     0.1000    0.0001
##    120        0.5926             nan     0.1000    0.0000
##    140        0.5873             nan     0.1000   -0.0000
##    150        0.5848             nan     0.1000    0.0001
# Relative influence of each predictor in the fitted GBM (summary.gbm via caret);
# output below shows Married.civ.spouse, capital.gain and education.num dominate.
summary(boostingtrain)

##                                         var     rel.inf
## Married.civ.spouse       Married.civ.spouse 36.52279385
## capital.gain                   capital.gain 19.98614726
## education.num                 education.num 19.43215771
## age                                     age  6.59421528
## capital.loss                   capital.loss  6.30308676
## hours.per.week               hours.per.week  4.36684944
## Exec.managerial             Exec.managerial  2.06255757
## Wife                                   Wife  0.69164792
## Prof.specialty               Prof.specialty  0.61752910
## Farming.fishing             Farming.fishing  0.55237470
## Self.emp.not.inc           Self.emp.not.inc  0.49422938
## Other.service                 Other.service  0.41444610
## Male                                   Male  0.39370763
## Tech.support                   Tech.support  0.32076657
## Married.AF.spouse         Married.AF.spouse  0.17353512
## Sales                                 Sales  0.16611691
## Local.gov                         Local.gov  0.14324173
## Not.in.family                 Not.in.family  0.13164689
## Self.emp.inc                   Self.emp.inc  0.09836303
## Machine.op.inspct         Machine.op.inspct  0.09136793
## White                                 White  0.08946089
## Protective.serv             Protective.serv  0.08671965
## United.States                 United.States  0.07037183
## Handlers.cleaners         Handlers.cleaners  0.06623173
## Own.child                         Own.child  0.05166759
## Philippines                     Philippines  0.04248981
## Widowed                             Widowed  0.01356904
## State.gov                         State.gov  0.01209187
## Transport.moving           Transport.moving  0.01061673
## No.gain                             No.gain  0.00000000
## Private                             Private  0.00000000
## Married.spouse.absent Married.spouse.absent  0.00000000
## Never.married                 Never.married  0.00000000
## Separated                         Separated  0.00000000
## Armed.Forces                   Armed.Forces  0.00000000
## Craft.repair                   Craft.repair  0.00000000
## Priv.house.serv             Priv.house.serv  0.00000000
## Other.relative               Other.relative  0.00000000
## Unmarried                         Unmarried  0.00000000
## Asian.Pac.Islander       Asian.Pac.Islander  0.00000000
## Black                                 Black  0.00000000
## Other                                 Other  0.00000000
## other_countries             other_countries  0.00000000
# Print the caret train object: resampling scheme (10-fold CV, 5 repeats),
# the tuning grid results, and the selected hyperparameters.
boostingtrain
## Stochastic Gradient Boosting 
## 
## 32402 samples
##    43 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8431146  0.4816248
##   1                  100      0.8524843  0.5377526
##   1                  150      0.8540584  0.5478247
##   2                   50      0.8532744  0.5423939
##   2                  100      0.8570582  0.5620259
##   2                  150      0.8604901  0.5781097
##   3                   50      0.8552002  0.5510964
##   3                  100      0.8612740  0.5805378
##   3                  150      0.8642429  0.5933076
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Winning hyperparameter combination chosen by Accuracy
# (n.trees = 150, interaction.depth = 3 per output below).
boostingtrain$bestTune
##   n.trees interaction.depth shrinkage n.minobsinnode
## 9     150                 3       0.1             10
# Full tuning-grid performance table: mean Accuracy/Kappa plus their SDs
# across the 50 resamples for each of the 9 parameter combinations.
boostingtrain$results
##   shrinkage interaction.depth n.minobsinnode n.trees  Accuracy     Kappa
## 1       0.1                 1             10      50 0.8431146 0.4816248
## 4       0.1                 2             10      50 0.8532744 0.5423939
## 7       0.1                 3             10      50 0.8552002 0.5510964
## 2       0.1                 1             10     100 0.8524843 0.5377526
## 5       0.1                 2             10     100 0.8570582 0.5620259
## 8       0.1                 3             10     100 0.8612740 0.5805378
## 3       0.1                 1             10     150 0.8540584 0.5478247
## 6       0.1                 2             10     150 0.8604901 0.5781097
## 9       0.1                 3             10     150 0.8642429 0.5933076
##    AccuracySD    KappaSD
## 1 0.004809954 0.01989279
## 4 0.005088176 0.01718602
## 7 0.005030751 0.01692701
## 2 0.005360968 0.01868402
## 5 0.005063951 0.01689716
## 8 0.004921149 0.01572846
## 3 0.005253151 0.01767353
## 6 0.004820025 0.01558992
## 9 0.004983303 0.01554980
# The final gbm fit refit on all training data with the best tuning values
# (bernoulli loss, 150 iterations; 29 of 43 predictors have non-zero influence).
boostingtrain$finalModel
## A gradient boosted model with bernoulli loss function.
## 150 iterations were performed.
## There were 43 predictors of which 29 had non-zero influence.
# Per-resample Accuracy/Kappa of the best model — one row for each of the
# 50 fold-by-repeat combinations, useful for judging performance variability.
boostingtrain$resample
##     Accuracy     Kappa    Resample
## 1  0.8731481 0.6167872 Fold04.Rep5
## 2  0.8673249 0.6005170 Fold01.Rep4
## 3  0.8623457 0.5871899 Fold03.Rep5
## 4  0.8756173 0.6304083 Fold10.Rep3
## 5  0.8604938 0.5754363 Fold05.Rep5
## 6  0.8672840 0.6073511 Fold02.Rep5
## 7  0.8608454 0.5856595 Fold09.Rep3
## 8  0.8567901 0.5705294 Fold02.Rep4
## 9  0.8642394 0.5943947 Fold01.Rep5
## 10 0.8614198 0.5842108 Fold08.Rep3
## 11 0.8716049 0.6145822 Fold09.Rep2
## 12 0.8568343 0.5676637 Fold06.Rep5
## 13 0.8608025 0.5851871 Fold10.Rep4
## 14 0.8604938 0.5733285 Fold07.Rep3
## 15 0.8694444 0.6113164 Fold08.Rep2
## 16 0.8673249 0.6051431 Fold03.Rep4
## 17 0.8638889 0.5904222 Fold09.Rep4
## 18 0.8577160 0.5739284 Fold08.Rep4
## 19 0.8611540 0.5835627 Fold07.Rep2
## 20 0.8657407 0.5955996 Fold10.Rep2
## 21 0.8722222 0.6190340 Fold07.Rep5
## 22 0.8632716 0.5905646 Fold07.Rep4
## 23 0.8660907 0.5987596 Fold06.Rep2
## 24 0.8570988 0.5737375 Fold07.Rep1
## 25 0.8672840 0.6012228 Fold04.Rep4
## 26 0.8604938 0.5808202 Fold04.Rep3
## 27 0.8632716 0.5873554 Fold05.Rep2
## 28 0.8703704 0.6108762 Fold06.Rep1
## 29 0.8645480 0.5970731 Fold01.Rep3
## 30 0.8679012 0.6050041 Fold08.Rep5
## 31 0.8580247 0.5670664 Fold06.Rep3
## 32 0.8638889 0.5924131 Fold04.Rep2
## 33 0.8608025 0.5827651 Fold08.Rep1
## 34 0.8626543 0.5871106 Fold05.Rep4
## 35 0.8641975 0.5962809 Fold05.Rep3
## 36 0.8651235 0.5925450 Fold03.Rep2
## 37 0.8663580 0.5982447 Fold04.Rep1
## 38 0.8614198 0.5901983 Fold02.Rep3
## 39 0.8657407 0.5952038 Fold09.Rep5
## 40 0.8580247 0.5771138 Fold02.Rep2
## 41 0.8645062 0.5954414 Fold03.Rep1
## 42 0.8707189 0.6131911 Fold09.Rep1
## 43 0.8719136 0.6138151 Fold06.Rep4
## 44 0.8558642 0.5671205 Fold01.Rep2
## 45 0.8651651 0.5957897 Fold02.Rep1
## 46 0.8558642 0.5666981 Fold05.Rep1
## 47 0.8688272 0.6113568 Fold03.Rep3
## 48 0.8570988 0.5849905 Fold10.Rep5
## 49 0.8688272 0.6102321 Fold10.Rep1
## 50 0.8700617 0.6101397 Fold01.Rep1
# Confusion-matrix cell counts (cell1..cell4) per resample and per tuning
# combination; rows cycle through the 9 grid points within each resample.
boostingtrain$resampledCM
##     shrinkage interaction.depth n.minobsinnode n.trees cell1 cell2 cell3
## 1         0.1                 1             10     150  2357   115   352
## 2         0.1                 1             10      50  2402    70   427
## 3         0.1                 1             10     100  2364   108   363
## 4         0.1                 2             10     150  2355   117   313
## 5         0.1                 2             10      50  2363   109   362
## 6         0.1                 2             10     100  2362   110   334
## 7         0.1                 3             10     150  2349   123   298
## 8         0.1                 3             10      50  2359   113   355
## 9         0.1                 3             10     100  2354   118   320
## 10        0.1                 1             10     150  2354   118   356
## 11        0.1                 1             10      50  2404    68   458
## 12        0.1                 1             10     100  2363   109   371
## 13        0.1                 2             10     150  2340   132   324
## 14        0.1                 2             10      50  2366   106   361
## 15        0.1                 2             10     100  2349   123   340
## 16        0.1                 3             10     150  2341   131   306
## 17        0.1                 3             10      50  2356   116   354
## 18        0.1                 3             10     100  2338   134   318
## 19        0.1                 1             10     150  2365   107   357
## 20        0.1                 1             10      50  2405    67   440
## 21        0.1                 1             10     100  2382    90   372
## 22        0.1                 2             10     150  2345   127   322
## 23        0.1                 2             10      50  2374    98   368
## 24        0.1                 2             10     100  2360   112   346
## 25        0.1                 3             10     150  2335   137   302
## 26        0.1                 3             10      50  2370   102   360
## 27        0.1                 3             10     100  2342   130   320
## 28        0.1                 1             10     150  2340   132   353
## 29        0.1                 1             10      50  2401    71   428
## 30        0.1                 1             10     100  2352   120   369
## 31        0.1                 2             10     150  2341   131   322
## 32        0.1                 2             10      50  2344   128   360
## 33        0.1                 2             10     100  2339   133   338
## 34        0.1                 3             10     150  2345   127   306
## 35        0.1                 3             10      50  2347   125   356
## 36        0.1                 3             10     100  2345   127   325
## 37        0.1                 1             10     150  2343   129   371
## 38        0.1                 1             10      50  2394    78   437
## 39        0.1                 1             10     100  2346   126   374
## 40        0.1                 2             10     150  2325   147   331
## 41        0.1                 2             10      50  2345   127   370
## 42        0.1                 2             10     100  2343   129   353
## 43        0.1                 3             10     150  2328   144   323
## 44        0.1                 3             10      50  2343   129   363
## 45        0.1                 3             10     100  2331   141   336
## 46        0.1                 1             10     150  2360   112   347
## 47        0.1                 1             10      50  2404    68   435
## 48        0.1                 1             10     100  2368   104   359
## 49        0.1                 2             10     150  2346   126   311
## 50        0.1                 2             10      50  2362   110   359
## 51        0.1                 2             10     100  2349   123   336
## 52        0.1                 3             10     150  2350   122   298
## 53        0.1                 3             10      50  2362   110   347
## 54        0.1                 3             10     100  2351   121   311
## 55        0.1                 1             10     150  2347   125   364
## 56        0.1                 1             10      50  2360   112   426
## 57        0.1                 1             10     100  2354   118   374
## 58        0.1                 2             10     150  2324   148   323
## 59        0.1                 2             10      50  2342   130   359
## 60        0.1                 2             10     100  2333   139   341
## 61        0.1                 3             10     150  2322   150   313
## 62        0.1                 3             10      50  2342   130   357
## 63        0.1                 3             10     100  2329   143   327
## 64        0.1                 1             10     150  2354   118   362
## 65        0.1                 1             10      50  2408    64   449
## 66        0.1                 1             10     100  2373    99   391
## 67        0.1                 2             10     150  2341   131   330
## 68        0.1                 2             10      50  2368   104   378
## 69        0.1                 2             10     100  2349   123   348
## 70        0.1                 3             10     150  2333   139   312
## 71        0.1                 3             10      50  2362   110   366
## 72        0.1                 3             10     100  2346   126   321
## 73        0.1                 1             10     150  2364   108   360
## 74        0.1                 1             10      50  2410    62   438
## 75        0.1                 1             10     100  2371   101   373
## 76        0.1                 2             10     150  2352   120   314
## 77        0.1                 2             10      50  2372   100   366
## 78        0.1                 2             10     100  2360   112   336
## 79        0.1                 3             10     150  2348   124   295
## 80        0.1                 3             10      50  2366   106   350
## 81        0.1                 3             10     100  2350   122   317
## 82        0.1                 1             10     150  2365   107   347
## 83        0.1                 1             10      50  2403    69   429
## 84        0.1                 1             10     100  2374    98   358
## 85        0.1                 2             10     150  2337   135   309
## 86        0.1                 2             10      50  2371   101   356
## 87        0.1                 2             10     100  2355   117   335
## 88        0.1                 3             10     150  2337   135   290
## 89        0.1                 3             10      50  2367   105   352
## 90        0.1                 3             10     100  2345   127   308
## 91        0.1                 1             10     150  2352   120   367
## 92        0.1                 1             10      50  2403    69   457
## 93        0.1                 1             10     100  2364   108   393
## 94        0.1                 2             10     150  2330   142   332
## 95        0.1                 2             10      50  2361   111   379
## 96        0.1                 2             10     100  2339   133   352
## 97        0.1                 3             10     150  2327   145   322
## 98        0.1                 3             10      50  2351   121   368
## 99        0.1                 3             10     100  2334   138   334
## 100       0.1                 1             10     150  2336   136   361
## 101       0.1                 1             10      50  2387    85   434
## 102       0.1                 1             10     100  2343   129   374
## 103       0.1                 2             10     150  2323   149   317
## 104       0.1                 2             10      50  2342   130   367
## 105       0.1                 2             10     100  2333   139   348
## 106       0.1                 3             10     150  2322   150   310
## 107       0.1                 3             10      50  2342   130   360
## 108       0.1                 3             10     100  2325   147   324
## 109       0.1                 1             10     150  2354   118   353
## 110       0.1                 1             10      50  2402    70   425
## 111       0.1                 1             10     100  2362   110   363
## 112       0.1                 2             10     150  2351   121   328
## 113       0.1                 2             10      50  2359   113   357
## 114       0.1                 2             10     100  2356   116   333
## 115       0.1                 3             10     150  2348   124   313
## 116       0.1                 3             10      50  2360   112   351
## 117       0.1                 3             10     100  2351   121   321
## 118       0.1                 1             10     150  2361   111   353
## 119       0.1                 1             10      50  2399    73   426
## 120       0.1                 1             10     100  2365   107   363
## 121       0.1                 2             10     150  2342   130   315
## 122       0.1                 2             10      50  2360   112   360
## 123       0.1                 2             10     100  2351   121   336
## 124       0.1                 3             10     150  2337   135   306
## 125       0.1                 3             10      50  2359   113   357
## 126       0.1                 3             10     100  2343   129   315
## 127       0.1                 1             10     150  2348   124   365
## 128       0.1                 1             10      50  2396    76   440
## 129       0.1                 1             10     100  2356   116   373
## 130       0.1                 2             10     150  2344   128   323
## 131       0.1                 2             10      50  2359   113   371
## 132       0.1                 2             10     100  2349   123   351
## 133       0.1                 3             10     150  2344   128   315
## 134       0.1                 3             10      50  2355   117   361
## 135       0.1                 3             10     100  2346   126   326
## 136       0.1                 1             10     150  2364   108   344
## 137       0.1                 1             10      50  2404    68   428
## 138       0.1                 1             10     100  2370   102   356
## 139       0.1                 2             10     150  2339   133   315
## 140       0.1                 2             10      50  2365   107   354
## 141       0.1                 2             10     100  2346   126   333
## 142       0.1                 3             10     150  2342   130   304
## 143       0.1                 3             10      50  2359   113   344
## 144       0.1                 3             10     100  2343   129   314
## 145       0.1                 1             10     150  2353   119   366
## 146       0.1                 1             10      50  2399    73   428
## 147       0.1                 1             10     100  2368   104   379
## 148       0.1                 2             10     150  2335   137   329
## 149       0.1                 2             10      50  2351   121   365
## 150       0.1                 2             10     100  2348   124   346
## 151       0.1                 3             10     150  2335   137   313
## 152       0.1                 3             10      50  2349   123   357
## 153       0.1                 3             10     100  2339   133   328
## 154       0.1                 1             10     150  2354   118   344
## 155       0.1                 1             10      50  2408    64   445
## 156       0.1                 1             10     100  2361   111   360
## 157       0.1                 2             10     150  2344   128   313
## 158       0.1                 2             10      50  2367   105   355
## 159       0.1                 2             10     100  2350   122   327
## 160       0.1                 3             10     150  2340   132   291
## 161       0.1                 3             10      50  2363   109   340
## 162       0.1                 3             10     100  2346   126   309
## 163       0.1                 1             10     150  2368   104   360
## 164       0.1                 1             10      50  2416    56   429
## 165       0.1                 1             10     100  2379    93   370
## 166       0.1                 2             10     150  2349   123   314
## 167       0.1                 2             10      50  2374    98   370
## 168       0.1                 2             10     100  2356   116   340
## 169       0.1                 3             10     150  2352   120   296
## 170       0.1                 3             10      50  2372   100   361
## 171       0.1                 3             10     100  2353   119   317
## 172       0.1                 1             10     150  2368   104   355
## 173       0.1                 1             10      50  2405    67   456
## 174       0.1                 1             10     100  2372   100   371
## 175       0.1                 2             10     150  2345   127   322
## 176       0.1                 2             10      50  2368   104   364
## 177       0.1                 2             10     100  2362   110   341
## 178       0.1                 3             10     150  2346   126   309
## 179       0.1                 3             10      50  2372   100   359
## 180       0.1                 3             10     100  2350   122   315
## 181       0.1                 1             10     150  2350   122   337
## 182       0.1                 1             10      50  2362   110   385
## 183       0.1                 1             10     100  2353   119   349
## 184       0.1                 2             10     150  2340   132   312
## 185       0.1                 2             10      50  2349   123   342
## 186       0.1                 2             10     100  2345   127   323
## 187       0.1                 3             10     150  2332   140   299
## 188       0.1                 3             10      50  2348   124   329
## 189       0.1                 3             10     100  2337   135   309
## 190       0.1                 1             10     150  2345   127   336
## 191       0.1                 1             10      50  2388    84   427
## 192       0.1                 1             10     100  2358   114   355
## 193       0.1                 2             10     150  2329   143   308
## 194       0.1                 2             10      50  2355   117   345
## 195       0.1                 2             10     100  2342   130   320
## 196       0.1                 3             10     150  2320   152   297
## 197       0.1                 3             10      50  2351   121   336
## 198       0.1                 3             10     100  2331   141   303
## 199       0.1                 1             10     150  2355   117   336
## 200       0.1                 1             10      50  2409    63   438
## 201       0.1                 1             10     100  2365   107   356
## 202       0.1                 2             10     150  2347   125   303
## 203       0.1                 2             10      50  2362   110   355
## 204       0.1                 2             10     100  2349   123   324
## 205       0.1                 3             10     150  2334   138   287
## 206       0.1                 3             10      50  2360   112   341
## 207       0.1                 3             10     100  2342   130   304
## 208       0.1                 1             10     150  2349   123   366
## 209       0.1                 1             10      50  2404    68   453
## 210       0.1                 1             10     100  2363   109   383
## 211       0.1                 2             10     150  2343   129   327
## 212       0.1                 2             10      50  2360   112   385
## 213       0.1                 2             10     100  2346   126   341
## 214       0.1                 3             10     150  2335   137   315
## 215       0.1                 3             10      50  2355   117   366
## 216       0.1                 3             10     100  2339   133   325
## 217       0.1                 1             10     150  2345   127   356
## 218       0.1                 1             10      50  2390    82   433
## 219       0.1                 1             10     100  2346   126   363
## 220       0.1                 2             10     150  2333   139   313
## 221       0.1                 2             10      50  2346   126   356
## 222       0.1                 2             10     100  2340   132   335
## 223       0.1                 3             10     150  2330   142   298
## 224       0.1                 3             10      50  2344   128   351
## 225       0.1                 3             10     100  2336   136   315
## 226       0.1                 1             10     150  2367   105   392
## 227       0.1                 1             10      50  2409    63   469
## 228       0.1                 1             10     100  2379    93   409
## 229       0.1                 2             10     150  2352   120   353
## 230       0.1                 2             10      50  2375    97   404
## 231       0.1                 2             10     100  2360   112   375
## 232       0.1                 3             10     150  2346   126   334
## 233       0.1                 3             10      50  2367   105   391
## 234       0.1                 3             10     100  2353   119   352
## 235       0.1                 1             10     150  2365   107   389
## 236       0.1                 1             10      50  2413    59   464
## 237       0.1                 1             10     100  2375    97   396
## 238       0.1                 2             10     150  2352   120   351
## 239       0.1                 2             10      50  2375    97   389
## 240       0.1                 2             10     100  2352   120   365
## 241       0.1                 3             10     150  2353   119   333
## 242       0.1                 3             10      50  2376    96   383
## 243       0.1                 3             10     100  2356   116   350
## 244       0.1                 1             10     150  2366   106   351
## 245       0.1                 1             10      50  2410    62   431
## 246       0.1                 1             10     100  2376    96   366
## 247       0.1                 2             10     150  2348   124   327
## 248       0.1                 2             10      50  2374    98   361
## 249       0.1                 2             10     100  2359   113   339
## 250       0.1                 3             10     150  2335   137   312
## 251       0.1                 3             10      50  2373    99   354
## 252       0.1                 3             10     100  2346   126   326
## 253       0.1                 1             10     150  2346   126   355
## 254       0.1                 1             10      50  2373    99   400
## 255       0.1                 1             10     100  2356   116   365
## 256       0.1                 2             10     150  2331   141   326
## 257       0.1                 2             10      50  2351   121   359
## 258       0.1                 2             10     100  2337   135   342
## 259       0.1                 3             10     150  2327   145   306
## 260       0.1                 3             10      50  2354   118   352
## 261       0.1                 3             10     100  2323   149   321
## 262       0.1                 1             10     150  2361   111   344
## 263       0.1                 1             10      50  2400    72   418
## 264       0.1                 1             10     100  2370   102   360
## 265       0.1                 2             10     150  2357   115   305
## 266       0.1                 2             10      50  2372   100   359
## 267       0.1                 2             10     100  2357   115   333
## 268       0.1                 3             10     150  2348   124   279
## 269       0.1                 3             10      50  2367   105   354
## 270       0.1                 3             10     100  2362   110   300
## 271       0.1                 1             10     150  2365   107   360
## 272       0.1                 1             10      50  2408    64   436
## 273       0.1                 1             10     100  2369   103   364
## 274       0.1                 2             10     150  2344   128   315
## 275       0.1                 2             10      50  2364   108   352
## 276       0.1                 2             10     100  2356   116   339
## 277       0.1                 3             10     150  2349   123   307
## 278       0.1                 3             10      50  2365   107   350
## 279       0.1                 3             10     100  2353   119   319
## 280       0.1                 1             10     150  2347   125   376
## 281       0.1                 1             10      50  2399    73   479
## 282       0.1                 1             10     100  2354   118   390
## 283       0.1                 2             10     150  2325   147   335
## 284       0.1                 2             10      50  2354   118   385
## 285       0.1                 2             10     100  2336   136   361
## 286       0.1                 3             10     150  2327   145   319
## 287       0.1                 3             10      50  2346   126   373
## 288       0.1                 3             10     100  2326   146   335
## 289       0.1                 1             10     150  2357   115   349
## 290       0.1                 1             10      50  2408    64   431
## 291       0.1                 1             10     100  2371   101   355
## 292       0.1                 2             10     150  2339   133   309
## 293       0.1                 2             10      50  2370   102   355
## 294       0.1                 2             10     100  2352   120   328
## 295       0.1                 3             10     150  2337   135   295
## 296       0.1                 3             10      50  2363   109   339
## 297       0.1                 3             10     100  2346   126   305
## 298       0.1                 1             10     150  2365   107   358
## 299       0.1                 1             10      50  2378    94   414
## 300       0.1                 1             10     100  2375    97   382
## 301       0.1                 2             10     150  2356   116   325
## 302       0.1                 2             10      50  2369   103   368
## 303       0.1                 2             10     100  2365   107   340
## 304       0.1                 3             10     150  2346   126   304
## 305       0.1                 3             10      50  2368   104   360
## 306       0.1                 3             10     100  2349   123   318
## 307       0.1                 1             10     150  2365   107   354
## 308       0.1                 1             10      50  2400    72   441
## 309       0.1                 1             10     100  2364   108   373
## 310       0.1                 2             10     150  2343   129   323
## 311       0.1                 2             10      50  2359   113   365
## 312       0.1                 2             10     100  2353   119   341
## 313       0.1                 3             10     150  2339   133   312
## 314       0.1                 3             10      50  2358   114   354
## 315       0.1                 3             10     100  2350   122   323
## 316       0.1                 1             10     150  2365   107   336
## 317       0.1                 1             10      50  2417    55   429
## 318       0.1                 1             10     100  2376    96   356
## 319       0.1                 2             10     150  2358   114   303
## 320       0.1                 2             10      50  2370   102   351
## 321       0.1                 2             10     100  2353   119   316
## 322       0.1                 3             10     150  2357   115   300
## 323       0.1                 3             10      50  2362   110   341
## 324       0.1                 3             10     100  2359   113   305
## 325       0.1                 1             10     150  2349   123   345
## 326       0.1                 1             10      50  2395    77   422
## 327       0.1                 1             10     100  2362   110   356
## 328       0.1                 2             10     150  2339   133   321
## 329       0.1                 2             10      50  2358   114   351
## 330       0.1                 2             10     100  2346   126   329
## 331       0.1                 3             10     150  2336   136   307
## 332       0.1                 3             10      50  2360   112   340
## 333       0.1                 3             10     100  2335   137   317
## 334       0.1                 1             10     150  2351   121   372
## 335       0.1                 1             10      50  2393    79   465
## 336       0.1                 1             10     100  2353   119   388
## 337       0.1                 2             10     150  2335   137   325
## 338       0.1                 2             10      50  2351   121   378
## 339       0.1                 2             10     100  2348   124   359
## 340       0.1                 3             10     150  2327   145   316
## 341       0.1                 3             10      50  2350   122   368
## 342       0.1                 3             10     100  2335   137   330
## 343       0.1                 1             10     150  2357   115   348
## 344       0.1                 1             10      50  2399    73   422
## 345       0.1                 1             10     100  2367   105   358
## 346       0.1                 2             10     150  2341   131   327
## 347       0.1                 2             10      50  2361   111   359
## 348       0.1                 2             10     100  2349   123   345
## 349       0.1                 3             10     150  2342   130   311
## 350       0.1                 3             10      50  2359   113   352
## 351       0.1                 3             10     100  2349   123   327
## 352       0.1                 1             10     150  2341   131   367
## 353       0.1                 1             10      50  2387    85   436
## 354       0.1                 1             10     100  2349   123   381
## 355       0.1                 2             10     150  2332   140   330
## 356       0.1                 2             10      50  2346   126   377
## 357       0.1                 2             10     100  2337   135   350
## 358       0.1                 3             10     150  2327   145   306
## 359       0.1                 3             10      50  2344   128   367
## 360       0.1                 3             10     100  2332   140   328
## 361       0.1                 1             10     150  2352   120   355
## 362       0.1                 1             10      50  2398    74   436
## 363       0.1                 1             10     100  2364   108   369
## 364       0.1                 2             10     150  2347   125   331
## 365       0.1                 2             10      50  2364   108   367
## 366       0.1                 2             10     100  2349   123   348
## 367       0.1                 3             10     150  2336   136   304
## 368       0.1                 3             10      50  2355   117   360
## 369       0.1                 3             10     100  2343   129   330
## 370       0.1                 1             10     150  2357   115   343
## 371       0.1                 1             10      50  2401    71   424
## 372       0.1                 1             10     100  2363   109   350
## 373       0.1                 2             10     150  2342   130   309
## 374       0.1                 2             10      50  2361   111   338
## 375       0.1                 2             10     100  2350   122   320
## 376       0.1                 3             10     150  2330   142   288
## 377       0.1                 3             10      50  2358   114   329
## 378       0.1                 3             10     100  2337   135   304
## 379       0.1                 1             10     150  2348   124   364
## 380       0.1                 1             10      50  2398    74   449
## 381       0.1                 1             10     100  2357   115   379
## 382       0.1                 2             10     150  2339   133   335
## 383       0.1                 2             10      50  2358   114   378
## 384       0.1                 2             10     100  2346   126   353
## 385       0.1                 3             10     150  2336   136   310
## 386       0.1                 3             10      50  2348   124   361
## 387       0.1                 3             10     100  2336   136   323
## 388       0.1                 1             10     150  2378    94   349
## 389       0.1                 1             10      50  2414    58   438
## 390       0.1                 1             10     100  2380    92   363
## 391       0.1                 2             10     150  2364   108   320
## 392       0.1                 2             10      50  2380    92   360
## 393       0.1                 2             10     100  2376    96   335
## 394       0.1                 3             10     150  2361   111   300
## 395       0.1                 3             10      50  2376    96   348
## 396       0.1                 3             10     100  2359   113   316
## 397       0.1                 1             10     150  2356   116   375
## 398       0.1                 1             10      50  2411    61   447
## 399       0.1                 1             10     100  2364   108   390
## 400       0.1                 2             10     150  2343   129   348
## 401       0.1                 2             10      50  2357   115   378
## 402       0.1                 2             10     100  2345   127   358
## 403       0.1                 3             10     150  2348   124   328
## 404       0.1                 3             10      50  2359   113   372
## 405       0.1                 3             10     100  2351   121   344
## 406       0.1                 1             10     150  2365   107   383
## 407       0.1                 1             10      50  2404    68   467
## 408       0.1                 1             10     100  2371   101   397
## 409       0.1                 2             10     150  2342   130   344
## 410       0.1                 2             10      50  2362   110   387
## 411       0.1                 2             10     100  2353   119   369
## 412       0.1                 3             10     150  2335   137   327
## 413       0.1                 3             10      50  2367   105   385
## 414       0.1                 3             10     100  2343   129   338
## 415       0.1                 1             10     150  2380    92   336
## 416       0.1                 1             10      50  2410    62   418
## 417       0.1                 1             10     100  2384    88   343
## 418       0.1                 2             10     150  2350   122   305
## 419       0.1                 2             10      50  2384    88   342
## 420       0.1                 2             10     100  2366   106   325
## 421       0.1                 3             10     150  2346   126   288
## 422       0.1                 3             10      50  2379    93   336
## 423       0.1                 3             10     100  2357   115   304
## 424       0.1                 1             10     150  2356   116   361
## 425       0.1                 1             10      50  2406    66   440
## 426       0.1                 1             10     100  2369   103   383
## 427       0.1                 2             10     150  2348   124   324
## 428       0.1                 2             10      50  2362   110   371
## 429       0.1                 2             10     100  2350   122   337
## 430       0.1                 3             10     150  2342   130   298
## 431       0.1                 3             10      50  2361   111   358
## 432       0.1                 3             10     100  2346   126   317
## 433       0.1                 1             10     150  2349   123   363
## 434       0.1                 1             10      50  2401    71   438
## 435       0.1                 1             10     100  2356   116   380
## 436       0.1                 2             10     150  2343   129   325
## 437       0.1                 2             10      50  2355   117   371
## 438       0.1                 2             10     100  2345   127   347
## 439       0.1                 3             10     150  2347   125   310
## 440       0.1                 3             10      50  2355   117   362
## 441       0.1                 3             10     100  2340   132   324
## 442       0.1                 1             10     150  2319   153   329
## 443       0.1                 1             10      50  2385    87   412
## 444       0.1                 1             10     100  2331   141   341
## 445       0.1                 2             10     150  2310   162   296
## 446       0.1                 2             10      50  2324   148   343
## 447       0.1                 2             10     100  2313   159   311
## 448       0.1                 3             10     150  2294   178   285
## 449       0.1                 3             10      50  2316   156   329
## 450       0.1                 3             10     100  2305   167   294
##     cell4    Resample
## 1     416 Fold01.Rep1
## 2     341 Fold01.Rep1
## 3     405 Fold01.Rep1
## 4     455 Fold01.Rep1
## 5     406 Fold01.Rep1
## 6     434 Fold01.Rep1
## 7     470 Fold01.Rep1
## 8     413 Fold01.Rep1
## 9     448 Fold01.Rep1
## 10    413 Fold02.Rep1
## 11    311 Fold02.Rep1
## 12    398 Fold02.Rep1
## 13    445 Fold02.Rep1
## 14    408 Fold02.Rep1
## 15    429 Fold02.Rep1
## 16    463 Fold02.Rep1
## 17    415 Fold02.Rep1
## 18    451 Fold02.Rep1
## 19    411 Fold03.Rep1
## 20    328 Fold03.Rep1
## 21    396 Fold03.Rep1
## 22    446 Fold03.Rep1
## 23    400 Fold03.Rep1
## 24    422 Fold03.Rep1
## 25    466 Fold03.Rep1
## 26    408 Fold03.Rep1
## 27    448 Fold03.Rep1
## 28    415 Fold04.Rep1
## 29    340 Fold04.Rep1
## 30    399 Fold04.Rep1
## 31    446 Fold04.Rep1
## 32    408 Fold04.Rep1
## 33    430 Fold04.Rep1
## 34    462 Fold04.Rep1
## 35    412 Fold04.Rep1
## 36    443 Fold04.Rep1
## 37    397 Fold05.Rep1
## 38    331 Fold05.Rep1
## 39    394 Fold05.Rep1
## 40    437 Fold05.Rep1
## 41    398 Fold05.Rep1
## 42    415 Fold05.Rep1
## 43    445 Fold05.Rep1
## 44    405 Fold05.Rep1
## 45    432 Fold05.Rep1
## 46    421 Fold06.Rep1
## 47    333 Fold06.Rep1
## 48    409 Fold06.Rep1
## 49    457 Fold06.Rep1
## 50    409 Fold06.Rep1
## 51    432 Fold06.Rep1
## 52    470 Fold06.Rep1
## 53    421 Fold06.Rep1
## 54    457 Fold06.Rep1
## 55    404 Fold07.Rep1
## 56    342 Fold07.Rep1
## 57    394 Fold07.Rep1
## 58    445 Fold07.Rep1
## 59    409 Fold07.Rep1
## 60    427 Fold07.Rep1
## 61    455 Fold07.Rep1
## 62    411 Fold07.Rep1
## 63    441 Fold07.Rep1
## 64    406 Fold08.Rep1
## 65    319 Fold08.Rep1
## 66    377 Fold08.Rep1
## 67    438 Fold08.Rep1
## 68    390 Fold08.Rep1
## 69    420 Fold08.Rep1
## 70    456 Fold08.Rep1
## 71    402 Fold08.Rep1
## 72    447 Fold08.Rep1
## 73    409 Fold09.Rep1
## 74    331 Fold09.Rep1
## 75    396 Fold09.Rep1
## 76    455 Fold09.Rep1
## 77    403 Fold09.Rep1
## 78    433 Fold09.Rep1
## 79    474 Fold09.Rep1
## 80    419 Fold09.Rep1
## 81    452 Fold09.Rep1
## 82    421 Fold10.Rep1
## 83    339 Fold10.Rep1
## 84    410 Fold10.Rep1
## 85    459 Fold10.Rep1
## 86    412 Fold10.Rep1
## 87    433 Fold10.Rep1
## 88    478 Fold10.Rep1
## 89    416 Fold10.Rep1
## 90    460 Fold10.Rep1
## 91    401 Fold01.Rep2
## 92    311 Fold01.Rep2
## 93    375 Fold01.Rep2
## 94    436 Fold01.Rep2
## 95    389 Fold01.Rep2
## 96    416 Fold01.Rep2
## 97    446 Fold01.Rep2
## 98    400 Fold01.Rep2
## 99    434 Fold01.Rep2
## 100   407 Fold02.Rep2
## 101   334 Fold02.Rep2
## 102   394 Fold02.Rep2
## 103   451 Fold02.Rep2
## 104   401 Fold02.Rep2
## 105   420 Fold02.Rep2
## 106   458 Fold02.Rep2
## 107   408 Fold02.Rep2
## 108   444 Fold02.Rep2
## 109   415 Fold03.Rep2
## 110   343 Fold03.Rep2
## 111   405 Fold03.Rep2
## 112   440 Fold03.Rep2
## 113   411 Fold03.Rep2
## 114   435 Fold03.Rep2
## 115   455 Fold03.Rep2
## 116   417 Fold03.Rep2
## 117   447 Fold03.Rep2
## 118   415 Fold04.Rep2
## 119   342 Fold04.Rep2
## 120   405 Fold04.Rep2
## 121   453 Fold04.Rep2
## 122   408 Fold04.Rep2
## 123   432 Fold04.Rep2
## 124   462 Fold04.Rep2
## 125   411 Fold04.Rep2
## 126   453 Fold04.Rep2
## 127   403 Fold05.Rep2
## 128   328 Fold05.Rep2
## 129   395 Fold05.Rep2
## 130   445 Fold05.Rep2
## 131   397 Fold05.Rep2
## 132   417 Fold05.Rep2
## 133   453 Fold05.Rep2
## 134   407 Fold05.Rep2
## 135   442 Fold05.Rep2
## 136   425 Fold06.Rep2
## 137   341 Fold06.Rep2
## 138   413 Fold06.Rep2
## 139   454 Fold06.Rep2
## 140   415 Fold06.Rep2
## 141   436 Fold06.Rep2
## 142   465 Fold06.Rep2
## 143   425 Fold06.Rep2
## 144   455 Fold06.Rep2
## 145   403 Fold07.Rep2
## 146   341 Fold07.Rep2
## 147   390 Fold07.Rep2
## 148   440 Fold07.Rep2
## 149   404 Fold07.Rep2
## 150   423 Fold07.Rep2
## 151   456 Fold07.Rep2
## 152   412 Fold07.Rep2
## 153   441 Fold07.Rep2
## 154   424 Fold08.Rep2
## 155   323 Fold08.Rep2
## 156   408 Fold08.Rep2
## 157   455 Fold08.Rep2
## 158   413 Fold08.Rep2
## 159   441 Fold08.Rep2
## 160   477 Fold08.Rep2
## 161   428 Fold08.Rep2
## 162   459 Fold08.Rep2
## 163   408 Fold09.Rep2
## 164   339 Fold09.Rep2
## 165   398 Fold09.Rep2
## 166   454 Fold09.Rep2
## 167   398 Fold09.Rep2
## 168   428 Fold09.Rep2
## 169   472 Fold09.Rep2
## 170   407 Fold09.Rep2
## 171   451 Fold09.Rep2
## 172   413 Fold10.Rep2
## 173   312 Fold10.Rep2
## 174   397 Fold10.Rep2
## 175   446 Fold10.Rep2
## 176   404 Fold10.Rep2
## 177   427 Fold10.Rep2
## 178   459 Fold10.Rep2
## 179   409 Fold10.Rep2
## 180   453 Fold10.Rep2
## 181   432 Fold01.Rep3
## 182   384 Fold01.Rep3
## 183   420 Fold01.Rep3
## 184   457 Fold01.Rep3
## 185   427 Fold01.Rep3
## 186   446 Fold01.Rep3
## 187   470 Fold01.Rep3
## 188   440 Fold01.Rep3
## 189   460 Fold01.Rep3
## 190   432 Fold02.Rep3
## 191   341 Fold02.Rep3
## 192   413 Fold02.Rep3
## 193   460 Fold02.Rep3
## 194   423 Fold02.Rep3
## 195   448 Fold02.Rep3
## 196   471 Fold02.Rep3
## 197   432 Fold02.Rep3
## 198   465 Fold02.Rep3
## 199   432 Fold03.Rep3
## 200   330 Fold03.Rep3
## 201   412 Fold03.Rep3
## 202   465 Fold03.Rep3
## 203   413 Fold03.Rep3
## 204   444 Fold03.Rep3
## 205   481 Fold03.Rep3
## 206   427 Fold03.Rep3
## 207   464 Fold03.Rep3
## 208   402 Fold04.Rep3
## 209   315 Fold04.Rep3
## 210   385 Fold04.Rep3
## 211   441 Fold04.Rep3
## 212   383 Fold04.Rep3
## 213   427 Fold04.Rep3
## 214   453 Fold04.Rep3
## 215   402 Fold04.Rep3
## 216   443 Fold04.Rep3
## 217   412 Fold05.Rep3
## 218   335 Fold05.Rep3
## 219   405 Fold05.Rep3
## 220   455 Fold05.Rep3
## 221   412 Fold05.Rep3
## 222   433 Fold05.Rep3
## 223   470 Fold05.Rep3
## 224   417 Fold05.Rep3
## 225   453 Fold05.Rep3
## 226   376 Fold06.Rep3
## 227   299 Fold06.Rep3
## 228   359 Fold06.Rep3
## 229   415 Fold06.Rep3
## 230   364 Fold06.Rep3
## 231   393 Fold06.Rep3
## 232   434 Fold06.Rep3
## 233   377 Fold06.Rep3
## 234   416 Fold06.Rep3
## 235   379 Fold07.Rep3
## 236   304 Fold07.Rep3
## 237   372 Fold07.Rep3
## 238   417 Fold07.Rep3
## 239   379 Fold07.Rep3
## 240   403 Fold07.Rep3
## 241   435 Fold07.Rep3
## 242   385 Fold07.Rep3
## 243   418 Fold07.Rep3
## 244   417 Fold08.Rep3
## 245   337 Fold08.Rep3
## 246   402 Fold08.Rep3
## 247   441 Fold08.Rep3
## 248   407 Fold08.Rep3
## 249   429 Fold08.Rep3
## 250   456 Fold08.Rep3
## 251   414 Fold08.Rep3
## 252   442 Fold08.Rep3
## 253   414 Fold09.Rep3
## 254   369 Fold09.Rep3
## 255   404 Fold09.Rep3
## 256   443 Fold09.Rep3
## 257   410 Fold09.Rep3
## 258   427 Fold09.Rep3
## 259   463 Fold09.Rep3
## 260   417 Fold09.Rep3
## 261   448 Fold09.Rep3
## 262   424 Fold10.Rep3
## 263   350 Fold10.Rep3
## 264   408 Fold10.Rep3
## 265   463 Fold10.Rep3
## 266   409 Fold10.Rep3
## 267   435 Fold10.Rep3
## 268   489 Fold10.Rep3
## 269   414 Fold10.Rep3
## 270   468 Fold10.Rep3
## 271   409 Fold01.Rep4
## 272   333 Fold01.Rep4
## 273   405 Fold01.Rep4
## 274   454 Fold01.Rep4
## 275   417 Fold01.Rep4
## 276   430 Fold01.Rep4
## 277   462 Fold01.Rep4
## 278   419 Fold01.Rep4
## 279   450 Fold01.Rep4
## 280   392 Fold02.Rep4
## 281   289 Fold02.Rep4
## 282   378 Fold02.Rep4
## 283   433 Fold02.Rep4
## 284   383 Fold02.Rep4
## 285   407 Fold02.Rep4
## 286   449 Fold02.Rep4
## 287   395 Fold02.Rep4
## 288   433 Fold02.Rep4
## 289   420 Fold03.Rep4
## 290   338 Fold03.Rep4
## 291   414 Fold03.Rep4
## 292   460 Fold03.Rep4
## 293   414 Fold03.Rep4
## 294   441 Fold03.Rep4
## 295   474 Fold03.Rep4
## 296   430 Fold03.Rep4
## 297   464 Fold03.Rep4
## 298   410 Fold04.Rep4
## 299   354 Fold04.Rep4
## 300   386 Fold04.Rep4
## 301   443 Fold04.Rep4
## 302   400 Fold04.Rep4
## 303   428 Fold04.Rep4
## 304   464 Fold04.Rep4
## 305   408 Fold04.Rep4
## 306   450 Fold04.Rep4
## 307   414 Fold05.Rep4
## 308   327 Fold05.Rep4
## 309   395 Fold05.Rep4
## 310   445 Fold05.Rep4
## 311   403 Fold05.Rep4
## 312   427 Fold05.Rep4
## 313   456 Fold05.Rep4
## 314   414 Fold05.Rep4
## 315   445 Fold05.Rep4
## 316   432 Fold06.Rep4
## 317   339 Fold06.Rep4
## 318   412 Fold06.Rep4
## 319   465 Fold06.Rep4
## 320   417 Fold06.Rep4
## 321   452 Fold06.Rep4
## 322   468 Fold06.Rep4
## 323   427 Fold06.Rep4
## 324   463 Fold06.Rep4
## 325   423 Fold07.Rep4
## 326   346 Fold07.Rep4
## 327   412 Fold07.Rep4
## 328   447 Fold07.Rep4
## 329   417 Fold07.Rep4
## 330   439 Fold07.Rep4
## 331   461 Fold07.Rep4
## 332   428 Fold07.Rep4
## 333   451 Fold07.Rep4
## 334   396 Fold08.Rep4
## 335   303 Fold08.Rep4
## 336   380 Fold08.Rep4
## 337   443 Fold08.Rep4
## 338   390 Fold08.Rep4
## 339   409 Fold08.Rep4
## 340   452 Fold08.Rep4
## 341   400 Fold08.Rep4
## 342   438 Fold08.Rep4
## 343   420 Fold09.Rep4
## 344   346 Fold09.Rep4
## 345   410 Fold09.Rep4
## 346   441 Fold09.Rep4
## 347   409 Fold09.Rep4
## 348   423 Fold09.Rep4
## 349   457 Fold09.Rep4
## 350   416 Fold09.Rep4
## 351   441 Fold09.Rep4
## 352   401 Fold10.Rep4
## 353   332 Fold10.Rep4
## 354   387 Fold10.Rep4
## 355   438 Fold10.Rep4
## 356   391 Fold10.Rep4
## 357   418 Fold10.Rep4
## 358   462 Fold10.Rep4
## 359   401 Fold10.Rep4
## 360   440 Fold10.Rep4
## 361   414 Fold01.Rep5
## 362   333 Fold01.Rep5
## 363   400 Fold01.Rep5
## 364   438 Fold01.Rep5
## 365   402 Fold01.Rep5
## 366   421 Fold01.Rep5
## 367   465 Fold01.Rep5
## 368   409 Fold01.Rep5
## 369   439 Fold01.Rep5
## 370   425 Fold02.Rep5
## 371   344 Fold02.Rep5
## 372   418 Fold02.Rep5
## 373   459 Fold02.Rep5
## 374   430 Fold02.Rep5
## 375   448 Fold02.Rep5
## 376   480 Fold02.Rep5
## 377   439 Fold02.Rep5
## 378   464 Fold02.Rep5
## 379   404 Fold03.Rep5
## 380   319 Fold03.Rep5
## 381   389 Fold03.Rep5
## 382   433 Fold03.Rep5
## 383   390 Fold03.Rep5
## 384   415 Fold03.Rep5
## 385   458 Fold03.Rep5
## 386   407 Fold03.Rep5
## 387   445 Fold03.Rep5
## 388   419 Fold04.Rep5
## 389   330 Fold04.Rep5
## 390   405 Fold04.Rep5
## 391   448 Fold04.Rep5
## 392   408 Fold04.Rep5
## 393   433 Fold04.Rep5
## 394   468 Fold04.Rep5
## 395   420 Fold04.Rep5
## 396   452 Fold04.Rep5
## 397   393 Fold05.Rep5
## 398   321 Fold05.Rep5
## 399   378 Fold05.Rep5
## 400   420 Fold05.Rep5
## 401   390 Fold05.Rep5
## 402   410 Fold05.Rep5
## 403   440 Fold05.Rep5
## 404   396 Fold05.Rep5
## 405   424 Fold05.Rep5
## 406   386 Fold06.Rep5
## 407   302 Fold06.Rep5
## 408   372 Fold06.Rep5
## 409   425 Fold06.Rep5
## 410   382 Fold06.Rep5
## 411   400 Fold06.Rep5
## 412   442 Fold06.Rep5
## 413   384 Fold06.Rep5
## 414   431 Fold06.Rep5
## 415   432 Fold07.Rep5
## 416   350 Fold07.Rep5
## 417   425 Fold07.Rep5
## 418   463 Fold07.Rep5
## 419   426 Fold07.Rep5
## 420   443 Fold07.Rep5
## 421   480 Fold07.Rep5
## 422   432 Fold07.Rep5
## 423   464 Fold07.Rep5
## 424   407 Fold08.Rep5
## 425   328 Fold08.Rep5
## 426   385 Fold08.Rep5
## 427   444 Fold08.Rep5
## 428   397 Fold08.Rep5
## 429   431 Fold08.Rep5
## 430   470 Fold08.Rep5
## 431   410 Fold08.Rep5
## 432   451 Fold08.Rep5
## 433   405 Fold09.Rep5
## 434   330 Fold09.Rep5
## 435   388 Fold09.Rep5
## 436   443 Fold09.Rep5
## 437   397 Fold09.Rep5
## 438   421 Fold09.Rep5
## 439   458 Fold09.Rep5
## 440   406 Fold09.Rep5
## 441   444 Fold09.Rep5
## 442   439 Fold10.Rep5
## 443   356 Fold10.Rep5
## 444   427 Fold10.Rep5
## 445   472 Fold10.Rep5
## 446   425 Fold10.Rep5
## 447   457 Fold10.Rep5
## 448   483 Fold10.Rep5
## 449   439 Fold10.Rep5
## 450   474 Fold10.Rep5
# Performance metrics tracked by the caret-tuned gbm ("Accuracy", "Kappa")
boostingtrain$perfNames
## [1] "Accuracy" "Kappa"
#Optimal model
# boostingoptimal <- gbm(income ~., data = combined[1:32402, ], distribution = "bernoulli", n.trees = 150,
#                 interaction.depth = 3, shrinkage = 0.1)
# summary(boostingoptimal)
# varImp(boostingoptimal, numTrees = 150)

#Test error of the optimal model
# testerroroptimal <- c()
# thresh <- 0.5
# for(i in 1:15){
#   yhat <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = (10 * i), type = "prob")
#   yhat <- (yhat > thresh)
#   testerroroptimal[i] <- mean(yhat != combined[32403:48598, 44])
# }
# plot(testerroroptimal)



#ROC curve - testing
set.seed(100)
# Predicted class probabilities for the held-out rows from the caret-tuned
# gbm; column 2 is P(income == ">50K").
# (Removed the dead `posopt <- c()` initialization - it was immediately
# overwritten by the predict() call.)
posopt <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = 150, type = "prob")
predictsopt <- prediction(posopt[, 2], combined[32403:48598, 44])
rocopt <- ROCR::performance(predictsopt, measure = "tpr", x.measure = "fpr")
plot(rocopt)
# Diagonal = performance of a random classifier
abline(0, 1, col = "red")

# Area under the ROC curve on the test split
aucopt <- ROCR::performance(predictsopt, measure = "auc")
aucopt@y.values
## [[1]]
## [1] 0.9177315
#ROC and AUC combined testing
# Overlay the ROC curves of the five candidate boosted models and the
# caret-tuned one on a single set of axes. ROCR's plot method supports
# `add = TRUE`, which draws onto the existing plot; the previous
# par(new = TRUE) approach re-drew full plots (axes and labels) on top of
# each other. The same `add =` idiom is already used elsewhere in this file.
plot(roc1, type = "l", col = "red",
     main = "1: red, 2: green, 3: blue, 4: black, 5: yellow, trained: purple")
plot(roc2, type = "l", col = "green", add = TRUE)
plot(roc3, type = "l", col = "blue", add = TRUE)
plot(roc4, type = "l", col = "black", add = TRUE)
plot(roc5, type = "l", col = "yellow", add = TRUE)
plot(rocopt, type = "l", col = "purple", add = TRUE)

#Train error of the optimal model
# trainerroropt <- c()
# thresh <- 0.5
# for(i in 1:500){
#   yhat <- predict(boostingoptimal, newdata = combined[1:32402, -44], n.trees = (10 * i), type = "response")
#   yhat <- (yhat > thresh)
#   trainerroropt[i] <- mean(yhat != combined[1:32402, 44])
# }
# plot(trainerroropt)



#ROC curve - training
# Same ROC/AUC computation as above, but on the training rows (1:32402).
# (Removed the dead `pos5opt <- c()` initialization - it was immediately
# overwritten by the predict() call.)
pos5opt <- predict(boostingtrain, newdata = combined[1:32402, -44], n.trees = 150, type = "prob")
predicts5opt <- prediction(pos5opt[, 2], combined[1:32402, 44])
roc5opt <- ROCR::performance(predicts5opt, measure = "tpr", x.measure = "fpr")
plot(roc5opt)
abline(0, 1, col = "red")

# Training-set AUC
auc5opt <- ROCR::performance(predicts5opt, measure = "auc")
auc5opt@y.values
## [[1]]
## [1] 0.9215941
# boosting <- C50::C5.0(newtrain2[, -45], newtrain2[, 45], trials = 10) #boosting iteration = 10
# summary(boosting)
# 
# classes <- predict(boosting, newtest2[, -45], type = "class")
# table(classes, newtest2[, 45])
# 
# acc <- sum(classes == newtest2[, 45]) / length(newtest2[, 45])
# acc



# https://github.com/topepo/caret/blob/master/RegressionTests/Code/C5.0.R 
# 
# cctrl1 <- trainControl(method = "cv", number = 3, returnResamp = "all",
#                        classProbs = TRUE, 
#                        summaryFunction = twoClassSummary)
# cctrl2 <- trainControl(method = "LOOCV",
#                        classProbs = TRUE, summaryFunction = twoClassSummary)
# cctrl3 <- trainControl(method = "none",
#                        classProbs = TRUE, summaryFunction = twoClassSummary)
# cctrlR <- trainControl(method = "cv", number = 3, returnResamp = "all",
#                        classProbs = TRUE, 
#                        search = "random")
# 
# y <- as.numeric(newtrain2$income) - 1 
# test_class_cv_model <- train(newtrain2[, -45], y, 
#                               method = "C5.0", 
#                               trControl = cctrl1,
#                               metric = "ROC", 
#                               control = C50::C5.0Control(seed = 1),
#                               preProc = c("center", "scale"))

\(\\\)

\(\\\)

Threshold decision

We commented this chunk out, since it takes extremely long to run…

\(\\\)

\(\\\)

Model selection

set.seed(100)
# Common probability cutoff for converting predicted P(>50K) into classes
thresh <- 0.5



# boosting1: gbm probabilities on the held-out rows, 5000 trees
a <- predict(boosting1, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
a1 <- (a > thresh)
# Test-set accuracy: logical TRUE/FALSE compares correctly with the 0/1 income
a2 <- mean(a1 == combined[32403:48598, 44])



# boosting2: 2000 trees
# NOTE(review): uses a hard-coded 0.3 cutoff instead of `thresh` -
# presumably the tuned threshold for this model; confirm.
b <- predict(boosting2, newdata = combined[32403:48598, -44], n.trees = 2000, type = "response")
b1 <- (b > 0.3)
b2 <- mean(b1 == combined[32403:48598, 44])



# boosting3: 5000 trees
# NOTE(review): `c` masks base::c() for the rest of the session - consider renaming.
c <- predict(boosting3, newdata = combined[32403:48598, -44], n.trees = 5000, type = "response")
c1 <- (c > thresh)
c2 <- mean(c1 == combined[32403:48598, 44])



# boosting4: 200 trees
d <- predict(boosting4, newdata = combined[32403:48598, -44], n.trees = 200, type = "response")
d1 <- (d > thresh)
d2 <- mean(d1 == combined[32403:48598, 44])



# boosting5: 800 trees
e <- predict(boosting5, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
e1 <- (e > thresh)
e2 <- mean(e1 == combined[32403:48598, 44])



# caret-tuned model: type = "raw" returns predicted class labels;
# as.numeric() - 1 maps them to 0/1 to match the recoded income column
f <- predict(boostingtrain, newdata = combined[32403:48598, -44], n.trees = 150, type = "raw")
f1 <- as.numeric(f) - 1
f2 <- mean(f1 == combined[32403:48598, 44])



# Test-set accuracy of each candidate (printed with knitr output below)
a2
## [1] 0.8631761
b2
## [1] 0.835021
c2
## [1] 0.857187
d2
## [1] 0.8700296
e2
## [1] 0.8710175
f2
## [1] 0.8670042

\(\\\)

\(\\\)

Best model

# Best boosted model by test-set AUC (boosting5; its AUC is ~0.923 in the
# ROC section below)
final.auc4 <- boosting5

# Best boosted model by thresholded test-set accuracy
# NOTE(review): in the accuracy comparison above, e2 (boosting5, 0.8710)
# exceeds d2 (boosting4, 0.8700), yet boosting4 is kept here - verify this
# assignment is intentional.
final.thres4 <- boosting4

Comment:

The model with the highest AUC and the model with the highest testing-set accuracy rate are the same!

Final Model Selection

set.seed(100)
# Reload the cleaned train/test splits (use TRUE, not the reassignable T)
newtrain2 <- read.csv("../data/cleandata/newtrain2.csv", header = TRUE)
newtest2 <- read.csv("../data/cleandata/newtest2.csv", header = TRUE)


#Change to binary digit
# Stack train on top of test so rows 1:32402 are the training data and
# rows 32403:48598 are the test data.
combined <- rbind(newtrain2, newtest2)
# Recode income to 0/1 ("<=50K" -> 0, ">50K" -> 1). Wrapping in factor()
# keeps this correct even when read.csv() leaves income as character
# (the default since R 4.0): as.numeric() on a character vector would
# silently produce NAs. If income is already a factor, factor() preserves
# its level order, so the result is unchanged.
combined$income <- as.numeric(factor(combined$income)) - 1

Validate your best supervised classifier on the test set

We picked the best classifiers from classification tree, bagged tree, and random forest by using AUC and the accuracy rate from the best threshold for training dataset.

Best classifiers by AUC

set.seed(100)
#from classification 
# Print the AUC-selected classification tree (rpart-style output below)
final.auc1
## n= 32402 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 32402 7682 <=50K (0.762915869 0.237084131)  
##    2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)  
##      4) capital.gain< 7073.5 17274  849 <=50K (0.950850990 0.049149010)  
##        8) education.num< 12.5 13864  342 <=50K (0.975331795 0.024668205)  
##         16) capital.loss< 2218.5 13807  315 <=50K (0.977185486 0.022814514) *
##         17) capital.loss>=2218.5 57   27 <=50K (0.526315789 0.473684211)  
##           34) capital.loss>=3343.5 8    0 <=50K (1.000000000 0.000000000) *
##           35) capital.loss< 3343.5 49   22 >50K (0.448979592 0.551020408) *
##        9) education.num>=12.5 3410  507 <=50K (0.851319648 0.148680352) *
##      5) capital.gain>=7073.5 284   11 >50K (0.038732394 0.961267606)  
##       10) capital.gain>=30961.5 5    0 <=50K (1.000000000 0.000000000) *
##       11) capital.gain< 30961.5 279    6 >50K (0.021505376 0.978494624)  
##         22) capital.gain< 8296 19    6 >50K (0.315789474 0.684210526)  
##           44) education.num< 11.5 8    2 <=50K (0.750000000 0.250000000) *
##           45) education.num>=11.5 11    0 >50K (0.000000000 1.000000000) *
##         23) capital.gain>=8296 260    0 >50K (0.000000000 1.000000000) *
##    3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)  
##      6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)  
##       12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)  
##         24) education.num< 8.5 1656  167 <=50K (0.899154589 0.100845411) *
##         25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)  
##           50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
##           51) capital.loss>=1782.5 335   83 >50K (0.247761194 0.752238806) *
##       13) capital.gain>=5095.5 496   11 >50K (0.022177419 0.977822581)  
##         26) capital.gain>=21045.5 2    0 <=50K (1.000000000 0.000000000) *
##         27) capital.gain< 21045.5 494    9 >50K (0.018218623 0.981781377) *
##      7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)  
##       14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)  
##         28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)  
##           56) hours.per.week< 31 306  112 <=50K (0.633986928 0.366013072) *
##           57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
##         29) capital.loss>=1782.5 398   13 >50K (0.032663317 0.967336683) *
##       15) capital.gain>=5095.5 581    3 >50K (0.005163511 0.994836489) *
#Getting predicted >50K of income probabilities 
# Column 2 of the probability matrix is P(income == ">50K")
tree_prob <- predict(final.auc1, newdata = newtest2, type = "prob")[, 2]
tree_prediction <- prediction(tree_prob, newtest2$income)
tree_performance <- ROCR::performance(tree_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
plot(tree_performance, main = "ROC curve")
# Diagonal = performance of a random classifier
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
tree.auc <- ROCR::performance(tree_prediction, measure="auc")@y.values[[1]]
tree.auc
## [1] 0.8768653
#==============================================================



#from bagged tree
# Print the tuned bagged-tree fit (randomForest; see Call in the output below)
final.auc2
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L) 
##                Type of random forest: classification
##                      Number of trees: 68
## No. of variables tried at each split: 43
## 
##         OOB estimate of  error rate: 13.66%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 22960 1760  0.07119741
## >50K   2666 5016  0.34704504
#Getting predicted >50K of income probabilities 
# Column 2 of the probability matrix is P(income == ">50K")
tunned.bag.rf_prob <- predict(final.auc2, newdata = newtest2,
                     type = "prob")[, 2]
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
                                               measure = "tpr",
                                               x.measure = "fpr")



#Plot ROC curve 
plot(tunned.bag.rf_performance, main = "ROC curve")
# Diagonal = performance of a random classifier
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
tunned.bag.rf.auc <- ROCR::performance(tunned.bag.rf_prediction,
                                   measure = "auc")@y.values[[1]]
tunned.bag.rf.auc
## [1] 0.8942506
#==============================================================



#from random forest
# Print the tuned random-forest fit (see Call in the output below)
final.auc3
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L) 
##                Type of random forest: classification
##                      Number of trees: 79
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 13.47%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23063 1657  0.06703074
## >50K   2709 4973  0.35264254
#Getting predicted >50K of income probabilities 
# Column 2 of the probability matrix is P(income == ">50K")
tunned.rf_prob <- predict(final.auc3, newdata = newtest2, 
                            type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction, measure = "tpr", x.measure = "fpr")



#Plot ROC curve 
plot(tunned.rf_performance, main = "ROC curve")
# Diagonal = performance of a random classifier
abline(a = 0, b = 1, lty = 2)

#Calculate AUC
tunned.rf.auc <- ROCR::performance(tunned.rf_prediction,
                                     measure = "auc")@y.values[[1]]
tunned.rf.auc
## [1] 0.8962369
#==============================================================



#from boosted
# Print the AUC-selected boosted model (gbm; see Call in the output below)
final.auc4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402, 
##     ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# P(income > 50K) on the held-out rows, evaluated at 800 trees as in the
# model-selection step above. (Removed the dead `pos5 <- c()` initialization
# - it was immediately overwritten by the predict() call.)
pos5 <- predict(final.auc4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[32403:48598, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "fpr")
plot(roc5, main = "ROC curve")
abline(0, 1, col = "red")

# Test-set AUC of the boosted model
auc5 <- ROCR::performance(predicts5, measure = "auc")
auc5@y.values
## [[1]]
## [1] 0.9231948

\(\\\)

\(\\\)

Confusion matrix - AUC criterion

set.seed(100)
# Confusion matrix for the AUC-selected classification tree on the test set
tree_class <- predict(final.auc1, newdata = newtest2, type = "class")
confusionMatrix(tree_class, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11859  1825
##      >50K    576  1936
##                                                
##                Accuracy : 0.8518               
##                  95% CI : (0.8462, 0.8572)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5298               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9537               
##             Specificity : 0.5148               
##          Pos Pred Value : 0.8666               
##          Neg Pred Value : 0.7707               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7322               
##    Detection Prevalence : 0.8449               
##       Balanced Accuracy : 0.7342               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Confusion matrix for the AUC-selected bagged tree on the test set
tunned.bag.rf_class <- predict(final.auc2, newdata = newtest2,
                     type = "class")
confusionMatrix(tunned.bag.rf_class, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11504  1340
##      >50K    931  2421
##                                                
##                Accuracy : 0.8598               
##                  95% CI : (0.8543, 0.8651)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5913               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9251               
##             Specificity : 0.6437               
##          Pos Pred Value : 0.8957               
##          Neg Pred Value : 0.7223               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7103               
##    Detection Prevalence : 0.7930               
##       Balanced Accuracy : 0.7844               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Confusion matrix for the AUC-selected random forest on the test set
tunned.rf_class <- predict(final.auc3, newdata = newtest2, 
                            type = "class")
confusionMatrix(tunned.rf_class, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11601  1362
##      >50K    834  2399
##                                                
##                Accuracy : 0.8644               
##                  95% CI : (0.859, 0.8696)      
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6002               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9329               
##             Specificity : 0.6379               
##          Pos Pred Value : 0.8949               
##          Neg Pred Value : 0.7420               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7163               
##    Detection Prevalence : 0.8004               
##       Balanced Accuracy : 0.7854               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Boosted-model class predictions on the held-out rows: predicted
# probabilities thresholded at 0.5, then labelled to match the income levels.
boosted_class <- predict.gbm(final.auc4,
                             newdata = combined[32403:48598, -44],
                             n.trees = 800, type = "response")
boosted_class <- ifelse(boosted_class > 0.5, ">50K", "<=50K")
# confusionMatrix() requires factor inputs, but ifelse() returns a character
# vector - convert explicitly, with levels in the same order as the
# reference column shown in the output below.
boosted_class <- factor(boosted_class, levels = c("<=50K", ">50K"))
confusionMatrix(boosted_class, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11703  1357
##      >50K    732  2404
##                                                
##                Accuracy : 0.871                
##                  95% CI : (0.8658, 0.8761)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.616                
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9411               
##             Specificity : 0.6392               
##          Pos Pred Value : 0.8961               
##          Neg Pred Value : 0.7666               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7226               
##    Detection Prevalence : 0.8064               
##       Balanced Accuracy : 0.7902               
##                                                
##        'Positive' Class : <=50K                
## 

\(\\\)

\(\\\)

Combine ROC curves

set.seed(100)
#Plot ROC curve 
# Overlay the test-set ROC curves of all four final models on common axes.
# Use TRUE rather than the reassignable shorthand T.
plot(tree_performance, main="ROC curve", col = "blue")   # classification
plot(tunned.bag.rf_performance, add = TRUE, col = "red")  # bagged
plot(tunned.rf_performance, add = TRUE, col = "green") # random forest
plot(roc5, add = TRUE) # boosted (default colour: black)
abline(a = 0, b = 1, lty = 2)
legend("bottomright", legend = c("Classification", "Bagged",
                                 "Boosted","Random Forest"),
       col=c("blue", "red", "black", "green"), lwd=3, cex=.5, horiz = TRUE)

# Collect the four test-set AUCs and display them in increasing order
AUC.final <- data.frame(tree.auc, tunned.bag.rf.auc, tunned.rf.auc,
                        boosted.auc = auc5@y.values[[1]])



AUC.final[, order(AUC.final)]
##    tree.auc tunned.bag.rf.auc tunned.rf.auc boosted.auc
## 1 0.8768653         0.8942506     0.8962369   0.9231948

\(\\\)

\(\\\)

TPR v.s. TNR

set.seed(100)
#from classification 
# Print the AUC-selected classification tree again (same fit as shown earlier)
final.auc1
## n= 32402 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 32402 7682 <=50K (0.762915869 0.237084131)  
##    2) Married.civ.spouse< 0.5 17558 1122 <=50K (0.936097505 0.063902495)  
##      4) capital.gain< 7073.5 17274  849 <=50K (0.950850990 0.049149010)  
##        8) education.num< 12.5 13864  342 <=50K (0.975331795 0.024668205)  
##         16) capital.loss< 2218.5 13807  315 <=50K (0.977185486 0.022814514) *
##         17) capital.loss>=2218.5 57   27 <=50K (0.526315789 0.473684211)  
##           34) capital.loss>=3343.5 8    0 <=50K (1.000000000 0.000000000) *
##           35) capital.loss< 3343.5 49   22 >50K (0.448979592 0.551020408) *
##        9) education.num>=12.5 3410  507 <=50K (0.851319648 0.148680352) *
##      5) capital.gain>=7073.5 284   11 >50K (0.038732394 0.961267606)  
##       10) capital.gain>=30961.5 5    0 <=50K (1.000000000 0.000000000) *
##       11) capital.gain< 30961.5 279    6 >50K (0.021505376 0.978494624)  
##         22) capital.gain< 8296 19    6 >50K (0.315789474 0.684210526)  
##           44) education.num< 11.5 8    2 <=50K (0.750000000 0.250000000) *
##           45) education.num>=11.5 11    0 >50K (0.000000000 1.000000000) *
##         23) capital.gain>=8296 260    0 >50K (0.000000000 1.000000000) *
##    3) Married.civ.spouse>=0.5 14844 6560 <=50K (0.558070601 0.441929399)  
##      6) education.num< 12.5 10475 3446 <=50K (0.671026253 0.328973747)  
##       12) capital.gain< 5095.5 9979 2961 <=50K (0.703276881 0.296723119)  
##         24) education.num< 8.5 1656  167 <=50K (0.899154589 0.100845411) *
##         25) education.num>=8.5 8323 2794 <=50K (0.664303737 0.335696263)  
##           50) capital.loss< 1782.5 7988 2542 <=50K (0.681772659 0.318227341) *
##           51) capital.loss>=1782.5 335   83 >50K (0.247761194 0.752238806) *
##       13) capital.gain>=5095.5 496   11 >50K (0.022177419 0.977822581)  
##         26) capital.gain>=21045.5 2    0 <=50K (1.000000000 0.000000000) *
##         27) capital.gain< 21045.5 494    9 >50K (0.018218623 0.981781377) *
##      7) education.num>=12.5 4369 1255 >50K (0.287251087 0.712748913)  
##       14) capital.gain< 5095.5 3788 1252 >50K (0.330517423 0.669482577)  
##         28) capital.loss< 1782.5 3390 1239 >50K (0.365486726 0.634513274)  
##           56) hours.per.week< 31 306  112 <=50K (0.633986928 0.366013072) *
##           57) hours.per.week>=31 3084 1045 >50K (0.338845655 0.661154345) *
##         29) capital.loss>=1782.5 398   13 >50K (0.032663317 0.967336683) *
##       15) capital.gain>=5095.5 581    3 >50K (0.005163511 0.994836489) *
#Getting predicted >50K of income probabilities 
tree_prob <- predict(final.auc1, newdata = newtest2, type = "prob")[, 2]
tree_prediction <- prediction(tree_prob, newtest2$income)
# NOTE: overwrites the earlier tree_performance global, now with
# x.measure = "tnr" (TPR vs TNR rather than a standard ROC curve)
tree_performance <- ROCR::performance(tree_prediction, measure = "tpr", x.measure = "tnr")



#Plot ROC curve 
plot(tree_performance, main = "TPR v.s. TNR")
# Anti-diagonal reference line: TPR + TNR = 1
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from bagged tree
# Print the AUC-selected bagged-tree fit again (see Call in the output below)
final.auc2
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L) 
##                Type of random forest: classification
##                      Number of trees: 68
## No. of variables tried at each split: 43
## 
##         OOB estimate of  error rate: 13.66%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 22960 1760  0.07119741
## >50K   2666 5016  0.34704504
#Getting predicted >50K of income probabilities 
tunned.bag.rf_prob <- predict(final.auc2, newdata = newtest2,
                     type = "prob")[, 2]
tunned.bag.rf_prediction <- prediction(tunned.bag.rf_prob, newtest2$income)
# NOTE: overwrites the earlier performance global with x.measure = "tnr"
tunned.bag.rf_performance <- ROCR::performance(tunned.bag.rf_prediction,
                                               measure = "tpr",
                                               x.measure = "tnr")



#Plot ROC curve 
plot(tunned.bag.rf_performance, main="TPR v.s. TNR")
# Anti-diagonal reference line: TPR + TNR = 1
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from random forest
# Print the tuned random-forest fit again (see Call in the output below)
final.auc3
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 79L, importance = TRUE, mtry = 8L, nodesize = 14L) 
##                Type of random forest: classification
##                      Number of trees: 79
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 13.47%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23063 1657  0.06703074
## >50K   2709 4973  0.35264254
#Getting predicted >50K of income probabilities 
tunned.rf_prob <- predict(final.auc3, newdata = newtest2, 
                            type = "prob")[, 2]
tunned.rf_prediction <- prediction(tunned.rf_prob, newtest2$income)
# NOTE: overwrites the earlier performance global with x.measure = "tnr"
tunned.rf_performance <- ROCR::performance(tunned.rf_prediction, measure = "tpr", x.measure = "tnr")



#Plot ROC curve 
plot(tunned.rf_performance, main = "TPR v.s. TNR")
# Anti-diagonal reference line: TPR + TNR = 1
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from boosted
# Print the AUC-selected boosted model again (gbm; see Call below)
final.auc4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402, 
##     ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.1)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# TPR vs TNR for the boosted model on the held-out rows.
# (Removed the dead `pos5 <- c()` initialization - it was immediately
# overwritten by the predict() call.)
pos5 <- predict(final.auc4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
predicts5 <- prediction(pos5, combined[32403:48598, 44])
roc5 <- ROCR::performance(predicts5, measure = "tpr", x.measure = "tnr")
plot(roc5, main="TPR v.s. TNR")
abline(a = 1, b = -1, col = "red")

# Overlay the TPR/TNR curves of all four final models on common axes
plot(tree_performance, main = "TPR v.s. TNR - AUC selection", col = "blue")
plot(tunned.bag.rf_performance, col = "red", add = TRUE)
plot(tunned.rf_performance, col = "green", add = TRUE)
plot(roc5, add = TRUE)
abline(a = 1, b = -1, lty = 2)
legend("bottomleft", legend = c("Classification", "Bagged",
                                 "Boosted","Random Forest"),
       col=c("blue", "red", "black", "green"), lwd=3, cex=.5, horiz = TRUE)

\(\\\)

\(\\\)

Best classifiers by accuracy with the optimal threshold

set.seed(100)
#from classification 
#final.thres1
# Predicted P(income == ">50K") from the accuracy-selected classification tree
# NOTE(review): final.thres1.half is fit elsewhere in the file - presumably
# the tree tuned for the 0.5 cutoff; confirm against the fitting code.
info_prob <- predict(final.thres1.half, newdata = newtest2, type = "prob")[, 2]



#Test accuracy rate by using default cutoff 0.5
# Accuracy = fraction of rows where the thresholded prediction agrees with
# the true label (both sides are logical vectors)
prunned.info.accuracy <- mean((info_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy classification :  ", prunned.info.accuracy, "\n")
## Accuracy classification :   0.8608916
#==============================================================




#from bagged tree 
#final.thres2  # bag.rforest$learner.model
# NOTE: reassigns the tunned.bag.rf_prob global used in earlier sections,
# now from the accuracy-selected bagged model
tunned.bag.rf_prob <- predict(final.thres2.half, newdata = newtest2,
                     type = "prob")[, 2]



#Test accuracy rate by using default cutoff 0.5
tunned.bagged.accuracy <- mean((tunned.bag.rf_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy Bagged :  ", tunned.bagged.accuracy, "\n")
## Accuracy Bagged :   0.8620647
#==============================================================



#from random forest
#final.thres3  # untunned.forest$learner.model
untunned.rf_prob <- predict(final.thres3.half, newdata = newtest2,
                            type = "prob")[, 2]



#Test accuracy rate by using default cutoff 0.5
rf.untunned.accuracy <- mean((untunned.rf_prob > 0.5) == (newtest2$income == ">50K"))
cat("Accuracy Random Forest :  ", rf.untunned.accuracy, "\n")
## Accuracy Random Forest :   0.866078
#==============================================================



#from boosting
#final.thres4
# NOTE(review): final.thres4 was assigned boosting4 above, which was
# evaluated with n.trees = 200 in the model-selection step, yet 800 trees
# are used here (boosting5's count) - verify the intended model/tree count.
e <- predict(final.thres4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
e1 <- (e > 0.5)
e2 <- mean(e1 == combined[32403:48598, 44])
cat("Accuracy Boosted :  ", e2, "\n")
## Accuracy Boosted :   0.8695357

Comment:

So our best classifier by the accuracy criterion is the boosted tree.

\(\\\)

\(\\\)

Confusion matrix - Accuracy criterion

set.seed(100)
# Classification tree: class predictions from the underlying rpart model.
# NOTE(review): predicting on $finalModel bypasses caret's predict.train
# wrapper; assumes newtest2 matches the training design exactly -- confirm.
classification_class2 <- predict(final.thres1.half$finalModel, newdata = newtest2, type = "class")
confusionMatrix(classification_class2, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11767  1585
##      >50K    668  2176
##                                                
##                Accuracy : 0.8609               
##                  95% CI : (0.8555, 0.8662)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5736               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9463               
##             Specificity : 0.5786               
##          Pos Pred Value : 0.8813               
##          Neg Pred Value : 0.7651               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7265               
##    Detection Prevalence : 0.8244               
##       Balanced Accuracy : 0.7624               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Bagged random forest: class predictions on the test set.
# NOTE(review): predict.randomForest documents type = "response"/"prob"/
# "vote", not "class" -- confirm this object dispatches as intended.
tunned.bag.rf_class2 <- predict(final.thres2.half, newdata = newtest2,
                     type = "class")
confusionMatrix(tunned.bag.rf_class2, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11504  1340
##      >50K    931  2421
##                                                
##                Accuracy : 0.8598               
##                  95% CI : (0.8543, 0.8651)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.5913               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9251               
##             Specificity : 0.6437               
##          Pos Pred Value : 0.8957               
##          Neg Pred Value : 0.7223               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7103               
##    Detection Prevalence : 0.7930               
##       Balanced Accuracy : 0.7844               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Untuned random forest: class predictions on the test set.
# NOTE(review): predict.randomForest documents type = "response"/"prob"/
# "vote", not "class" -- confirm this object dispatches as intended.
untunned.rf_class2 <- predict(final.thres3.half, newdata = newtest2,
                            type = "class")
confusionMatrix(untunned.rf_class2, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11598  1359
##      >50K    837  2402
##                                                
##                Accuracy : 0.8644               
##                  95% CI : (0.859, 0.8696)      
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6004               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9327               
##             Specificity : 0.6387               
##          Pos Pred Value : 0.8951               
##          Neg Pred Value : 0.7416               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7161               
##    Detection Prevalence : 0.8000               
##       Balanced Accuracy : 0.7857               
##                                                
##        'Positive' Class : <=50K                
## 
#==============================================================



# Boosted model: gbm returns P(income > 50K); threshold at 0.5 and map the
# result onto the income labels before building the confusion matrix.
boosted_class2 <- predict(final.thres4, newdata = combined[32403:48598, -44], n.trees = 800, type = "response")
boosted_class2 <- ifelse(boosted_class2 > 0.5, ">50K", "<=50K")
# Fix: ifelse() yields a character vector, but confusionMatrix() requires
# `data` and `reference` to be factors with identical levels (current caret
# errors on character input). Reuse the reference's levels so the level
# order matches.
boosted_class2 <- factor(boosted_class2, levels = levels(newtest2$income))
confusionMatrix(boosted_class2, newtest2$income)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction <=50K  >50K
##      <=50K 11667  1345
##      >50K    768  2416
##                                                
##                Accuracy : 0.8695               
##                  95% CI : (0.8643, 0.8747)     
##     No Information Rate : 0.7678               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.6134               
##  Mcnemar's Test P-Value : < 0.00000000000000022
##                                                
##             Sensitivity : 0.9382               
##             Specificity : 0.6424               
##          Pos Pred Value : 0.8966               
##          Neg Pred Value : 0.7588               
##              Prevalence : 0.7678               
##          Detection Rate : 0.7204               
##    Detection Prevalence : 0.8034               
##       Balanced Accuracy : 0.7903               
##                                                
##        'Positive' Class : <=50K                
## 

\(\\\)

\(\\\)

TPR vs. TNR

set.seed(100)
#from classification 
# Print the accuracy-selected CART model, then build its TPR-vs-TNR curve.
final.thres1.half
## CART 
## 
## 32402 samples
##    43 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 29162, 29161, 29162, 29162, 29162, 29162, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.001171570  0.8576940  0.5683314
##   0.001366832  0.8572001  0.5639238
##   0.002212965  0.8547208  0.5507302
##   0.002629524  0.8536818  0.5444061
##   0.003558101  0.8501429  0.5352532
##   0.006769071  0.8442687  0.5163148
##   0.010999740  0.8432194  0.5114075
##   0.034105702  0.8385798  0.4924548
##   0.061702682  0.8264613  0.4422631
##   0.120997136  0.7879256  0.1876654
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.00117157.
#Getting predicted >50K of income probabilities 
tree_prob2 <- predict(final.thres1.half, newdata = newtest2, 
                      type = "prob")[, 2]
# ROCR: sort scores once, then trace tpr (y) against tnr (x).
tree_prediction2 <- prediction(tree_prob2, newtest2$income)
tree_performance2 <- ROCR::performance(tree_prediction2,
                                      measure = "tpr", x.measure = "tnr")



#Plot ROC curve 
plot(tree_performance2, main = "TPR v.s. TNR")
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from bagged tree
# Print the accuracy-selected bagged model (randomForest with mtry = 43,
# i.e. all predictors), then build its TPR-vs-TNR curve.
final.thres2.half
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 68L, mtry = 43, importance = TRUE, nodesize = 43L) 
##                Type of random forest: classification
##                      Number of trees: 68
## No. of variables tried at each split: 43
## 
##         OOB estimate of  error rate: 13.66%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 22960 1760  0.07119741
## >50K   2666 5016  0.34704504
#Getting predicted >50K of income probabilities 
tunned.bag.rf_prob2 <- predict(final.thres2.half, newdata = newtest2,
                     type = "prob")[, 2]
tunned.bag.rf_prediction2 <- prediction(tunned.bag.rf_prob2, newtest2$income)
tunned.bag.rf_performance2 <- ROCR::performance(tunned.bag.rf_prediction2,
                                               measure = "tpr",
                                               x.measure = "tnr")



#Plot ROC curve 
plot(tunned.bag.rf_performance2, main = "TPR v.s. TNR")
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from random forest
# Print the untuned random forest (default mtry = 6), then build its
# TPR-vs-TNR curve.
final.thres3.half
## 
## Call:
##  randomForest(formula = f, data = data, classwt = classwt, cutoff = cutoff,      ntree = 50L, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 13.57%
## Confusion matrix:
##       <=50K >50K class.error
## <=50K 23021 1699  0.06872977
## >50K   2698 4984  0.35121062
#Getting predicted >50K of income probabilities 
untunned.rf_prob3 <- predict(final.thres3.half, newdata = newtest2, 
                            type = "prob")[, 2]
untunned.rf_prediction3 <- prediction(untunned.rf_prob3, newtest2$income)
untunned.rf_performance3 <- ROCR::performance(untunned.rf_prediction3,
                                             measure = "tpr", x.measure = "tnr")



#Plot ROC curve 
plot(untunned.rf_performance3, main = "TPR v.s. TNR")
abline(a = 1, b = -1, lty = 2)

#==============================================================



#from boosted
# Print the accuracy-selected boosted model, then build its TPR-vs-TNR curve.
final.thres4
## gbm(formula = income ~ ., distribution = "bernoulli", data = combined[1:32402, 
##     ], n.trees = 5000, interaction.depth = 3, shrinkage = 0.2)
## A gradient boosted model with bernoulli loss function.
## 5000 iterations were performed.
## There were 43 predictors of which 42 had non-zero influence.
#ROC curve - testing
# Predicted P(income > 50K) on the test rows (32403:48598; column 44 is the
# response). (Removed the dead `pos5b <- c()` initialization: the vector was
# immediately overwritten by predict(), so pre-assigning it had no effect.)
pos5b <- predict(final.thres4, newdata = combined[32403:48598, -44], n.trees = 800,
                 type = "response")
predicts5b <- prediction(pos5b, combined[32403:48598, 44])
roc5b <- ROCR::performance(predicts5b, measure = "tpr", x.measure = "tnr")
plot(roc5b, main = "TPR v.s. TNR")
abline(a = 1, b = -1, col = "red")

#Combine into one graph
# Overlay all four classifiers' TPR-vs-TNR curves (accuracy-selected models).
plot(tree_performance2, main = "TPR v.s. TNR - Accuracy selection", 
     col = "blue")
plot(tunned.bag.rf_performance2, col = "red", add = TRUE)
plot(untunned.rf_performance3, col = "green", add = TRUE)
plot(roc5b, add = TRUE)  # drawn in the default colour (black)
abline(a = 1, b = -1, lty = 2)
legend("bottomleft", legend = c("Classification", "Bagged",
                                "Boosted", "Random Forest"),
       col = c("blue", "red", "black", "green"), lwd = 3, cex = 0.5,
       horiz = TRUE)